diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..a38aef2373317497f3b225f882295ff108de3fbf 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,51 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_117_IMG_8176.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_127_1E703639-9169-4EB4-8A5D-3B1E226F639B.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_14_IMG_20220721_150241.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_153_IMG_20240411_102841.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_155_IMG_20240403_094547.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_158_IMG_20240416_141427.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_159_IMG_20240416_140243.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_187_mmexport1713446177339.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_19_IMG_0395.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_217_IMG_3854.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_22_IMG_20240404_135935.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_236_IMG_4742.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_34_IMG_0400.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_40_IMG_20170613_190017.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_61_mmexport1550554859117.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_64_IMG_20240409_181047.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_68_IMG_0971.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_70_IMG_0581.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_72_IMG_0618.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_80_IMG_6323.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_87_IMG_5316.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14456664_88_wx_camera_1712730236546.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_108_IMG_6205.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_109_IMG_5997.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_112_IMG_3570.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_117_IMG_1362.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_122_53D0E884-E012-4FFA-8BD7-8F5666A53123.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_144_IMG_8800.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_14_IMG_3852.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_162_IMG_0356.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_168_D4D36C59-2D68-4354-B05C-B3065F6581AA.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_174_IMG_8019.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_183_IMG_20240502_122921.jpg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_209_IMG_3105.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_211_IMG_9139.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_237_IMG_5909.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_257_79874AA6-4F23-4AD6-96E2-7F64039A81C0.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_261_0E3192C7-1B0D-4C4D-9788-5E7A5E6E92BA.jpeg filter=lfs diff=lfs merge=lfs -text +demo/food/14521898_262_75B3AEDF-C705-400B-AE86-3FA7A92B624C.jpeg filter=lfs diff=lfs merge=lfs -text 
+demo/food/14521898_265_1C4A3265-A18E-46C2-A264-4384B85A49C0.jpeg filter=lfs diff=lfs merge=lfs -text
+demo/food/14521898_44_930F3702-FC28-498F-A5A7-9E01AA5AEA15.jpeg filter=lfs diff=lfs merge=lfs -text
+demo/food/14521898_55_IMG_8533.jpeg filter=lfs diff=lfs merge=lfs -text
+demo/food/14521898_65_IMG_20240424_121225.jpg filter=lfs diff=lfs merge=lfs -text
+demo/food/14521898_75_IMG_8637.jpeg filter=lfs diff=lfs merge=lfs -text
+demo/food/14521898_79_IMG_2612.jpeg filter=lfs diff=lfs merge=lfs -text
+demo/food/14521898_80_IMG_2613.jpeg filter=lfs diff=lfs merge=lfs -text
+demo/food/14521898_82_IMG_9847.jpeg filter=lfs diff=lfs merge=lfs -text
+outputs/uploaded/4.jpg filter=lfs diff=lfs merge=lfs -text
diff --git a/demo.py b/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..59dfd203e99164286d0d6b0362776dc86353547f
--- /dev/null
+++ b/demo.py
@@ -0,0 +1,119 @@
+import os
+import random
+import numpy as np
+import torch
+import torch.backends.cudnn as cudnn
+import gradio as gr
+
+from utils import load_json, init_logger
+from demo import ConversationalAgent, CustomTheme
+
+FOOD_EXAMPLES = "demo/food_for_demo.json"
+# MODEL_PATH = "/root/share/new_models/OpenGVLab/InternVL2-2B"
+MODEL_PATH = "/root/xtuner/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10"
+OUTPUT_PATH = "./outputs"
+
+def setup_seeds():
+    seed = 42
+
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+    cudnn.benchmark = False
+    cudnn.deterministic = True
+
+
+def main():
+    setup_seeds()
+    # logging
+    init_logger(OUTPUT_PATH)
+    # food examples
+    food_examples = load_json(FOOD_EXAMPLES)
+
+    agent = ConversationalAgent(model_path=MODEL_PATH,
+                                outputs_dir=OUTPUT_PATH)
+
+    theme = CustomTheme()
+
+    titles = [
+        """书生大模型实战营"""  ## Kalam:wght@700
+        """「进阶岛」InternVL 多模态模型部署微调实践"""
+    ]
+
+    language = """Language: 中文 and English"""
+    with gr.Blocks(theme) as demo_chatbot:
+        for title in titles:
+            gr.Markdown(title)
+        # gr.Markdown(article)
+        gr.Markdown(language)
+
+        with gr.Row():
+            with gr.Column(scale=3):
+                start_btn = gr.Button("Start Chat", variant="primary", interactive=True)
+                clear_btn = gr.Button("Clear Context", interactive=False)
+                image = gr.Image(type="pil", interactive=False)
+                upload_btn = gr.Button("🖼️ Upload Image", interactive=False)
+
+                with gr.Accordion("Generation Settings"):
+                    top_p = gr.Slider(minimum=0, maximum=1, step=0.1,
+                                      value=0.8,
+                                      interactive=True,
+                                      label='top-p value',
+                                      visible=True)
+
+                    temperature = gr.Slider(minimum=0, maximum=1.5, step=0.1,
+                                            value=0.8,
+                                            interactive=True,
+                                            label='temperature',
+                                            visible=True)
+
+            with gr.Column(scale=7):
+                chat_state = gr.State()
+                chatbot = gr.Chatbot(label='InternVL2', height=800, avatar_images=((os.path.join(os.path.dirname(__file__), 'demo/user.png')), (os.path.join(os.path.dirname(__file__), "demo/bot.png"))))
+                text_input = gr.Textbox(label='User', placeholder="Please click the button to start chat!", interactive=False)
+                gr.Markdown("### 输入示例")
+                def on_text_change(text):
+                    return gr.update(interactive=True)
+                text_input.change(fn=on_text_change, inputs=text_input, outputs=text_input)
+                gr.Examples(
+                    examples=[["图片中的食物通常属于哪个菜系?"],
+                              ["如果让你简单形容一下品尝图片中的食物的滋味,你会描述它"],
+                              ["去哪个地方游玩时应该品尝当地的特色美食图片中的食物?"],
+                              ["食用图片中的食物时,一般它上菜或摆盘时的特点是?"]],
+                    inputs=[text_input]
+                )
+
+        with gr.Row():
+            gr.Markdown("### 食物快捷栏")
+        with gr.Row():
+            example_xinjiang_food = gr.Examples(examples=food_examples["新疆菜"], inputs=image, label="新疆菜")
+            example_sichuan_food = gr.Examples(examples=food_examples["川菜(四川,重庆)"], inputs=image, label="川菜(四川,重庆)")
+            example_xibei_food = gr.Examples(examples=food_examples["西北菜 (陕西,甘肃等地)"], inputs=image, label="西北菜 (陕西,甘肃等地)")
+        with gr.Row():
+            example_guizhou_food = gr.Examples(examples=food_examples["黔菜 (贵州)"], inputs=image, label="黔菜 (贵州)")
+            example_jiangsu_food = gr.Examples(examples=food_examples["苏菜(江苏)"], inputs=image, label="苏菜(江苏)")
+            example_guangdong_food = gr.Examples(examples=food_examples["粤菜(广东等地)"], inputs=image, label="粤菜(广东等地)")
+        with gr.Row():
+            example_hunan_food = gr.Examples(examples=food_examples["湘菜(湖南)"], inputs=image, label="湘菜(湖南)")
+            example_fujian_food = gr.Examples(examples=food_examples["闽菜(福建)"], inputs=image, label="闽菜(福建)")
+            example_zhejiang_food = gr.Examples(examples=food_examples["浙菜(浙江)"], inputs=image, label="浙菜(浙江)")
+        with gr.Row():
+            example_dongbei_food = gr.Examples(examples=food_examples["东北菜 (黑龙江等地)"], inputs=image, label="东北菜 (黑龙江等地)")
+
+
+        start_btn.click(agent.start_chat, [chat_state], [text_input, start_btn, clear_btn, image, upload_btn, chat_state])
+        clear_btn.click(agent.restart_chat, [chat_state], [chatbot, text_input, start_btn, clear_btn, image, upload_btn, chat_state], queue=False)
+        upload_btn.click(agent.upload_image, [image, chatbot, chat_state], [image, chatbot, chat_state])
+        text_input.submit(
+            agent.respond,
+            inputs=[text_input, image, chatbot, top_p, temperature, chat_state],
+            outputs=[text_input, image, chatbot, chat_state]
+        )
+
+    demo_chatbot.queue()  # enable the request queue before the blocking launch() call
+    demo_chatbot.launch(share=True, server_name="127.0.0.1", server_port=1096, allowed_paths=['./'])
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/demo/__init__.py b/demo/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..743e4ac650cd862cafd3e1fa880896933afdb26b
--- /dev/null
+++ b/demo/__init__.py
@@ -0,0 +1,8 @@
+from demo.agent import ConversationalAgent
+from demo.theme import CustomTheme
+
+
+__all__ = [
+    "ConversationalAgent",
+    "CustomTheme",
+]
\ No newline at end of file
diff --git a/demo/__pycache__/__init__.cpython-310.pyc b/demo/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2f8eef6c1a68f39e8fb198f180be09cd50ce677e
Binary files /dev/null and b/demo/__pycache__/__init__.cpython-310.pyc differ
diff --git a/demo/__pycache__/agent.cpython-310.pyc b/demo/__pycache__/agent.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c47cbc55031806083a05da9af5ce39008ce5846
Binary files /dev/null and b/demo/__pycache__/agent.cpython-310.pyc differ
diff --git a/demo/__pycache__/theme.cpython-310.pyc b/demo/__pycache__/theme.cpython-310.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcebc0afe2ead27d10fac23e05308d97abc4cdda
Binary files /dev/null and b/demo/__pycache__/theme.cpython-310.pyc differ
diff --git a/demo/agent.py b/demo/agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b98b7e9dcbd6e0d2d67cb830031a85ea75c4f89
--- /dev/null
+++ b/demo/agent.py
@@ -0,0 +1,105 @@
+import os
+import logging
+from datetime import datetime
+
+import gradio as gr
+from PIL import Image
+
+from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig, ChatTemplateConfig
+from lmdeploy.vl import load_image
+
+class ConversationalAgent:
+    def __init__(self,
+                 model_path,
+                 outputs_dir) -> None:
+        self.pipe = pipeline(model_path,
+                             chat_template_config=ChatTemplateConfig(model_name='internvl2-internlm2'),
+                             backend_config=TurbomindEngineConfig(session_len=8192))
+        self.uploaded_images_storage = os.path.join(outputs_dir, "uploaded")
+        self.uploaded_images_storage = os.path.abspath(self.uploaded_images_storage)
+        os.makedirs(self.uploaded_images_storage, exist_ok=True)
+        self.sess = None
+
+    def start_chat(self, chat_state):
+        self.sess = None
+        self.context = ""
+        self.current_image_id = -1
+        self.image_list = []
+        self.pixel_values_list = []
+        self.seen_image_idx = []
+        logging.info("=" * 30 + "Start Chat" + "=" * 30)
+
+        return (
+            #gr.update(interactive=False),  # [image] Image
+            gr.update(interactive=True, placeholder='input the text.'),  # [input_text] Textbox
+            gr.update(interactive=False),  # [start_btn] Button
+            gr.update(interactive=True),   # [clear_btn] Button
+            gr.update(interactive=True),   # [image] Image
+            gr.update(interactive=True),   # [upload_btn] Button
+            chat_state                     # [chat_state] State
+        )
+
+    def restart_chat(self, chat_state):
+        self.sess = None
+        self.context = ""
+        self.current_image_id = -1
+        self.image_list = []
+        self.pixel_values_list = []
+        self.seen_image_idx = []
+
+        logging.info("=" * 30 + "End Chat" + "=" * 30)
+
+        return (
+            None,  # [chatbot] Chatbot
+            #gr.update(value=None, interactive=True),  # [image] Image
+            gr.update(interactive=False, placeholder="Please click the button to start chat!"),  # [input_text] Textbox
+            gr.update(interactive=True),   # [start] Button
+            gr.update(interactive=False),  # [clear] Button
+            gr.update(value=None, interactive=False),  # [image] Image
+            gr.update(interactive=False),  # [upload_btn] Button
+            chat_state                     # [chat_state] State
+        )
+
+    def upload_image(self, image: Image.Image, chat_history: gr.Chatbot, chat_state: gr.State):
+        logging.info(f"type(image): {type(image)}")
+
+        self.image_list.append(image)
+        save_image_path = os.path.join(self.uploaded_images_storage,
+                                       "{}.jpg".format(len(os.listdir(self.uploaded_images_storage))))
+        image.save(save_image_path)
+        logging.info(f"image save path: {save_image_path}")
+        chat_history.append((gr.HTML(f''), "Received."))
+
+        return None, chat_history, chat_state
+
+    def respond(
+        self,
+        message,
+        image,
+        chat_history: gr.Chatbot,
+        top_p,
+        temperature,
+        chat_state,
+    ):
+        current_time = datetime.now().strftime("%b%d-%H:%M:%S")
+        logging.info(f"Time: {current_time}")
+        logging.info(f"User: {message}")
+        gen_config = GenerationConfig(top_p=top_p, temperature=temperature)
+        chat_input = message
+        if image is not None:
+            save_image_path = os.path.join(self.uploaded_images_storage, "{}.jpg".format(len(os.listdir(self.uploaded_images_storage))))
+            image.save(save_image_path)
+            logging.info(f"image save path: {save_image_path}")
+            chat_input = (message, image)  # the VLM pipeline takes a (text, PIL.Image) tuple
+        if self.sess is None:
+            self.sess = self.pipe.chat(chat_input, gen_config=gen_config)  # first turn: start a new session
+        else:
+            self.sess = self.pipe.chat(chat_input, session=self.sess, gen_config=gen_config)  # later turns: reuse the session for multi-turn context
+        response = self.sess.response.text
+        if image is not None:
+            chat_history.append((gr.HTML(f'{message}\n\n'), response))
+        else:
+            chat_history.append((message, response))
+
+        logging.info(f"generated text = \n{response}")
+
+        return "", None, chat_history, chat_state
diff --git a/demo/bot.png b/demo/bot.png
new file mode 100644
index 0000000000000000000000000000000000000000..0047bf66e24ff259b7ea02081316c3d881854856
Binary files /dev/null and b/demo/bot.png differ
diff --git a/demo/food/14456664_117_IMG_8176.jpeg b/demo/food/14456664_117_IMG_8176.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..de292c40ad1fe11f1418e5bd2ef861c691f844c6
--- /dev/null
+++ b/demo/food/14456664_117_IMG_8176.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f844ef4ef78a325596a63ab702dafa0a67781348de8c12b58b9a9b655953ade1
+size 2635383
diff --git a/demo/food/14456664_126_86838F28-912B-42A4-80C1-BD060B649081.jpeg b/demo/food/14456664_126_86838F28-912B-42A4-80C1-BD060B649081.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..5e3594e979b48a58b7cae098dac2e71dbc242384
Binary files /dev/null and b/demo/food/14456664_126_86838F28-912B-42A4-80C1-BD060B649081.jpeg differ
diff --git a/demo/food/14456664_127_1E703639-9169-4EB4-8A5D-3B1E226F639B.jpeg b/demo/food/14456664_127_1E703639-9169-4EB4-8A5D-3B1E226F639B.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..ffff39046fbd98dbaa9f09b0088b55986ab3a925
--- /dev/null
+++ b/demo/food/14456664_127_1E703639-9169-4EB4-8A5D-3B1E226F639B.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad9fa9010a5731ac6c0870bf2a572c406f0954b7d3bf3b5ca1f63c78863e7762
+size 1895518
diff --git a/demo/food/14456664_129_IMG_20220605_181308.jpg b/demo/food/14456664_129_IMG_20220605_181308.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0c8f8ba7ac60e4a2b49da4dd4a101bf2053fd4ec
Binary files /dev/null and b/demo/food/14456664_129_IMG_20220605_181308.jpg differ
diff --git a/demo/food/14456664_130_IMG_20220605_180820.jpg b/demo/food/14456664_130_IMG_20220605_180820.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6a6a8915147f9f29c3d6fcd73d1dd0f2b45a274b
Binary files /dev/null and b/demo/food/14456664_130_IMG_20220605_180820.jpg differ
diff --git a/demo/food/14456664_133_IMG_20220709_153436.jpg b/demo/food/14456664_133_IMG_20220709_153436.jpg
new file mode 100644
index
0000000000000000000000000000000000000000..320b6e6df4bacef7c823a7e79ff5aaafd59ce17d Binary files /dev/null and b/demo/food/14456664_133_IMG_20220709_153436.jpg differ diff --git a/demo/food/14456664_134_IMG_20220709_153652.jpg b/demo/food/14456664_134_IMG_20220709_153652.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f072514649b4220848caa470ad86dcd326ef3236 Binary files /dev/null and b/demo/food/14456664_134_IMG_20220709_153652.jpg differ diff --git a/demo/food/14456664_136_IMG_20220709_233905.jpg b/demo/food/14456664_136_IMG_20220709_233905.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a618cc6a12b745e5c6292762d37abdeb8c21080b Binary files /dev/null and b/demo/food/14456664_136_IMG_20220709_233905.jpg differ diff --git a/demo/food/14456664_137_IMG_20220717_143941.jpg b/demo/food/14456664_137_IMG_20220717_143941.jpg new file mode 100644 index 0000000000000000000000000000000000000000..827fa7df378da7cc39a54922e8aacfce4f7659ad Binary files /dev/null and b/demo/food/14456664_137_IMG_20220717_143941.jpg differ diff --git a/demo/food/14456664_139_IMG_0917-EDIT.jpg b/demo/food/14456664_139_IMG_0917-EDIT.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c9a3e89817881c16bafba80a90d10b5ec870e7c1 Binary files /dev/null and b/demo/food/14456664_139_IMG_0917-EDIT.jpg differ diff --git a/demo/food/14456664_141_IMG_1724.jpg b/demo/food/14456664_141_IMG_1724.jpg new file mode 100644 index 0000000000000000000000000000000000000000..03ec919973e3a183dfbd5657c2d54fb67adcf253 Binary files /dev/null and b/demo/food/14456664_141_IMG_1724.jpg differ diff --git a/demo/food/14456664_147_IMG_20190225_184723.jpg b/demo/food/14456664_147_IMG_20190225_184723.jpg new file mode 100644 index 0000000000000000000000000000000000000000..77d69185b3b210127c1285c24251a5b39c1cfb86 Binary files /dev/null and b/demo/food/14456664_147_IMG_20190225_184723.jpg differ diff --git a/demo/food/14456664_149_IMG_20180812_182400.jpg b/demo/food/14456664_149_IMG_20180812_182400.jpg new file mode 100644 index 0000000000000000000000000000000000000000..59e0f26d05577d05a56648ad3abebcd3779982f8 Binary files /dev/null and b/demo/food/14456664_149_IMG_20180812_182400.jpg differ diff --git a/demo/food/14456664_14_IMG_20220721_150241.jpg b/demo/food/14456664_14_IMG_20220721_150241.jpg new file mode 100644 index 0000000000000000000000000000000000000000..130fc96287eae1a527f11020d826fa1be76f626f --- /dev/null +++ b/demo/food/14456664_14_IMG_20220721_150241.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df1c0a0fa513ed03b735318e47c7393a7761bf46a33ab60b9f9d3898355e71e2 +size 3953392 diff --git a/demo/food/14456664_150_IMG_20181218_190852.jpg b/demo/food/14456664_150_IMG_20181218_190852.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ffca3dc5d016449d3062330a270b6806b2633d78 Binary files /dev/null and b/demo/food/14456664_150_IMG_20181218_190852.jpg differ diff --git a/demo/food/14456664_153_IMG_20240411_102841.jpg b/demo/food/14456664_153_IMG_20240411_102841.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3fa1342aa1af42e7985b60de892e134fd242b6b4 --- /dev/null +++ b/demo/food/14456664_153_IMG_20240411_102841.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b151c68460ea7bc0fc85d91535a2bc94ed88dcf4d761046d2a37722c5bfa5f5 +size 1933043 diff --git a/demo/food/14456664_155_IMG_20240403_094547.jpg b/demo/food/14456664_155_IMG_20240403_094547.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..fe25a403669ccc6b815a37459f8cc8ddea71e31a --- /dev/null +++ b/demo/food/14456664_155_IMG_20240403_094547.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65b3e7f66dbace96b3f790e17c38abedddfa07bf886e1d5c8159feb30352e5b6 +size 2268286 diff --git a/demo/food/14456664_158_IMG_20240416_141427.jpg b/demo/food/14456664_158_IMG_20240416_141427.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f0862f000294867e3af304f137601f1c68a42ac6 --- /dev/null +++ b/demo/food/14456664_158_IMG_20240416_141427.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f622069e0add72f4e778af7c223922a66dc1478ed13a1458dd3455ae3fe04495 +size 2899692 diff --git a/demo/food/14456664_159_IMG_20240416_140243.jpg b/demo/food/14456664_159_IMG_20240416_140243.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4d4dc6e831fe54d953ffeb3dc20cba2f0a1369c7 --- /dev/null +++ b/demo/food/14456664_159_IMG_20240416_140243.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce5bf8987b1bb99fb874a8ba8043952c2a03d6593d28447d44f09d4814402124 +size 2814714 diff --git a/demo/food/14456664_187_mmexport1713446177339.jpg b/demo/food/14456664_187_mmexport1713446177339.jpg new file mode 100644 index 0000000000000000000000000000000000000000..49532a08feab5ec18378e51b91592f6a0e73bc2d --- /dev/null +++ b/demo/food/14456664_187_mmexport1713446177339.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8fa2b9fe1ef98e53d61be59061e848e7338b09e5d3f8e01fb376b877698461b +size 1000537 diff --git a/demo/food/14456664_19_IMG_0395.jpeg b/demo/food/14456664_19_IMG_0395.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..4eb48c3f49895c57980e299fe48960cae56bf4a0 --- /dev/null +++ b/demo/food/14456664_19_IMG_0395.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3776a76f3612ab93123499a7fdb8bfabaad2596ee76eca754f4806fd404bc89 +size 3110275 diff --git a/demo/food/14456664_201_mmexport1713452408322.jpg b/demo/food/14456664_201_mmexport1713452408322.jpg new file mode 100644 index 0000000000000000000000000000000000000000..71fd4b2aa9151337d9f3a1c7b3fef4ef675192e8 Binary files /dev/null and b/demo/food/14456664_201_mmexport1713452408322.jpg differ diff --git a/demo/food/14456664_217_IMG_3854.jpeg b/demo/food/14456664_217_IMG_3854.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e8759885afa1ec305d77db047ebdf26d073f7f8c --- /dev/null +++ b/demo/food/14456664_217_IMG_3854.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6ee11d0137ab40d9c1bf743535cbd163e00c6de5fd69022febfd55a4abae610 +size 4983580 diff --git a/demo/food/14456664_22_IMG_20240404_135935.jpg b/demo/food/14456664_22_IMG_20240404_135935.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fdd5d509b54ae4f523ee60e8839112f193d1e881 --- /dev/null +++ b/demo/food/14456664_22_IMG_20240404_135935.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4f79f735255036164c0dac6d5cff946ef36a765eeed815992e437a1f48aa12b +size 3185124 diff --git a/demo/food/14456664_236_IMG_4742.jpeg b/demo/food/14456664_236_IMG_4742.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..91d26386b593e2e1db315740982f373f2bc7f115 --- /dev/null +++ b/demo/food/14456664_236_IMG_4742.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a25050eff6c82d7dac7cbd2fb7e37eec886867b512e6ea0390c40cf7ca961338 +size 2396007 diff --git a/demo/food/14456664_238_d370e16ff482d13a0718c6fb4cd7ffbb.jpeg b/demo/food/14456664_238_d370e16ff482d13a0718c6fb4cd7ffbb.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..478203da9ad6f307945aaa9f4c7056901dea80f5 Binary files /dev/null and b/demo/food/14456664_238_d370e16ff482d13a0718c6fb4cd7ffbb.jpeg differ diff --git a/demo/food/14456664_239_mmexport1713518196678.jpg b/demo/food/14456664_239_mmexport1713518196678.jpg new file mode 100644 index 0000000000000000000000000000000000000000..59b37f5f7c29081df92c0e5073c4da9422c02ee4 Binary files /dev/null and b/demo/food/14456664_239_mmexport1713518196678.jpg differ diff --git a/demo/food/14456664_26_IMG_3472.jpeg b/demo/food/14456664_26_IMG_3472.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..c4ea4015f4f601c0de23f3b7fa409d306d3dbd50 Binary files /dev/null and b/demo/food/14456664_26_IMG_3472.jpeg differ diff --git a/demo/food/14456664_34_IMG_0400.jpeg b/demo/food/14456664_34_IMG_0400.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..66a85ec7ba5d783d10bc272be2b173fe09f8a01b --- /dev/null +++ b/demo/food/14456664_34_IMG_0400.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd191af0a76621edeb10abe04cc952bb03c761b2abc733e5fe5474639d7b0e2 +size 1810607 diff --git a/demo/food/14456664_36_20240410160819.jpg b/demo/food/14456664_36_20240410160819.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e4814505fe0444c3d07bcd1ca40724d88e6dff2c Binary files /dev/null and b/demo/food/14456664_36_20240410160819.jpg differ diff --git a/demo/food/14456664_40_IMG_20170613_190017.jpg b/demo/food/14456664_40_IMG_20170613_190017.jpg new file mode 100644 index 0000000000000000000000000000000000000000..31f7238c996ac3605db56700f76688fc93f7bef4 --- /dev/null +++ b/demo/food/14456664_40_IMG_20170613_190017.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d813465ccf81c04c65d7dbab663b2948ac4298e31bda62d0bd0df5dcb90e539d +size 3052416 diff --git a/demo/food/14456664_41_IMG_20190216_124017.jpeg b/demo/food/14456664_41_IMG_20190216_124017.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..f61dbf0ec3ffeb64e0ef1bc33763574c085889a3 Binary files /dev/null and b/demo/food/14456664_41_IMG_20190216_124017.jpeg differ diff --git a/demo/food/14456664_50_mmexport1712821641752.jpg b/demo/food/14456664_50_mmexport1712821641752.jpg new file mode 100644 index 0000000000000000000000000000000000000000..ee1db803e7af7d6248f247ae1c5918db71f34ec5 Binary files /dev/null and b/demo/food/14456664_50_mmexport1712821641752.jpg differ diff --git a/demo/food/14456664_61_mmexport1550554859117.jpg b/demo/food/14456664_61_mmexport1550554859117.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4a2011c9bc4cc61dcadb87a67eea54877730fe7f --- /dev/null +++ b/demo/food/14456664_61_mmexport1550554859117.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d95fbbf0cce24c3a8779a87216e4ece91af058b443129696405e7384705b9037 +size 2249334 diff --git a/demo/food/14456664_64_IMG_20240409_181047.jpg b/demo/food/14456664_64_IMG_20240409_181047.jpg new file mode 100644 index 0000000000000000000000000000000000000000..90be8705e480b374c0a3d50cc8656e7ab37cd25d --- /dev/null +++ b/demo/food/14456664_64_IMG_20240409_181047.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:070c656fa584ccf2331c37fa4e780381bab3b0d808aea54cba406fd66b7735bb +size 2675554 diff --git a/demo/food/14456664_68_IMG_0971.jpeg b/demo/food/14456664_68_IMG_0971.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e8e2d2dd8272a24cf577148fc82c3a847db9da50 --- /dev/null +++ b/demo/food/14456664_68_IMG_0971.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ae462fcb5b73a7c228ee775b08fe84827a383e9f663e2c31c1ef4c086a67c03 +size 2790379 diff --git a/demo/food/14456664_70_IMG_0581.jpeg b/demo/food/14456664_70_IMG_0581.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..a128a639b94a209d2f8fd1a1c3d9de518b3fb84d --- /dev/null +++ b/demo/food/14456664_70_IMG_0581.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05a32a6d8ce58c42a4ba4e852fabc9b326b1b66937ade06ed0251ea6408f4571 +size 1805093 diff --git a/demo/food/14456664_72_IMG_0618.jpeg b/demo/food/14456664_72_IMG_0618.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..4b85d120ff5ef180ad83d46280e7041f85c4234a --- /dev/null +++ b/demo/food/14456664_72_IMG_0618.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57b49b974c7b966ac2042e4efa506c5b55a9e6038ba7f1bb76434a97b5e06fe7 +size 2483224 diff --git a/demo/food/14456664_80_IMG_6323.jpeg b/demo/food/14456664_80_IMG_6323.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..cf7c31d713b41743d83d941c4ff26b4247821177 --- /dev/null +++ b/demo/food/14456664_80_IMG_6323.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a71fe70bfd223397ab7f0af42159ac501d787d99bbbbf61165b2f11e6f6ba0a +size 3666529 diff --git a/demo/food/14456664_82_MVIMG_20240413_161454.jpeg b/demo/food/14456664_82_MVIMG_20240413_161454.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..168dfa1334a8728ca08567613e9e9a9fc5a83c95 Binary files /dev/null and b/demo/food/14456664_82_MVIMG_20240413_161454.jpeg differ diff --git a/demo/food/14456664_87_IMG_5316.jpeg b/demo/food/14456664_87_IMG_5316.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..5524a01ce37ab92d2945066105e71c5aa8895ad3 --- /dev/null +++ b/demo/food/14456664_87_IMG_5316.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beadadd5d26749545b59159f8914f4a6ca9602f9a37c18f9ae6d578c62717eec +size 3141260 diff --git a/demo/food/14456664_88_wx_camera_1712730236546.jpg b/demo/food/14456664_88_wx_camera_1712730236546.jpg new file mode 100644 index 0000000000000000000000000000000000000000..aae1d469192ac7282236bf42a3b71591bae53771 --- /dev/null +++ b/demo/food/14456664_88_wx_camera_1712730236546.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdc6b744e764770bff3cf73f7fb9bbe1424f88e59b4e22bb238c8765aec90483 +size 1288147 diff --git a/demo/food/14521898_108_IMG_6205.jpeg b/demo/food/14521898_108_IMG_6205.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..1bf41fee7a750499919128ef899e474d14686f38 --- /dev/null +++ b/demo/food/14521898_108_IMG_6205.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:575cc946cb96af9bf5bece1fcda3eaf6a17dab7ba28f127d0573b068cfd4d9c0 +size 2624459 diff --git a/demo/food/14521898_109_IMG_5997.jpeg b/demo/food/14521898_109_IMG_5997.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..b6dbdbc0cf4a10ee5bb59140e3f72d142682b5f8 --- /dev/null +++ b/demo/food/14521898_109_IMG_5997.jpeg @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:ba100e072a5fc01aa755087c2f3a5edc6e7e622fc7f6d3ba5b40f92b7e1476f3 +size 3535453 diff --git a/demo/food/14521898_112_IMG_3570.jpeg b/demo/food/14521898_112_IMG_3570.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..f5aa50978211e3c00399b2aaf0d1f32c7d9fddcb --- /dev/null +++ b/demo/food/14521898_112_IMG_3570.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac8006a805982d44946da2574bdf9fff99fa1d592aa5e6eb86dfd9fd1b68f1a5 +size 2764359 diff --git a/demo/food/14521898_113_IMG_1724.jpeg b/demo/food/14521898_113_IMG_1724.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..98f9b594e2aa59ec8825b4dd6bdd382fb281f813 Binary files /dev/null and b/demo/food/14521898_113_IMG_1724.jpeg differ diff --git a/demo/food/14521898_117_IMG_1362.jpeg b/demo/food/14521898_117_IMG_1362.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..98b9674a0f769bd7dadeb5fe4690823f2e846fea --- /dev/null +++ b/demo/food/14521898_117_IMG_1362.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c382431d796f1c21f35ead732e20f54398ad4f7ce218bc2137fbe1450684b9d4 +size 2310789 diff --git a/demo/food/14521898_122_53D0E884-E012-4FFA-8BD7-8F5666A53123.jpeg b/demo/food/14521898_122_53D0E884-E012-4FFA-8BD7-8F5666A53123.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..0db0c40a5f643b0c021e7df4d150be8147aff857 --- /dev/null +++ b/demo/food/14521898_122_53D0E884-E012-4FFA-8BD7-8F5666A53123.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d75a4e7cc4276874b3cf31ad6322e56f58b7cb0a0264eb44440d8e241848052 +size 1506122 diff --git a/demo/food/14521898_12_fb48488412a1846c104b28600f4f1ded.jpeg b/demo/food/14521898_12_fb48488412a1846c104b28600f4f1ded.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..bed372e6f803ffa9ab70fd9cec77b9b6428fa223 Binary files /dev/null and b/demo/food/14521898_12_fb48488412a1846c104b28600f4f1ded.jpeg differ diff --git a/demo/food/14521898_144_IMG_8800.jpeg b/demo/food/14521898_144_IMG_8800.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..0afca15d01662415a2fe62565684f6547aed8f98 --- /dev/null +++ b/demo/food/14521898_144_IMG_8800.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75c9bc5a693419c74183996df82f3185f161becf4190a2c5beb0e4181707f104 +size 1919833 diff --git a/demo/food/14521898_14_IMG_3852.jpeg b/demo/food/14521898_14_IMG_3852.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e30fa7c33761df4546eb1077f727f0ba019fed87 --- /dev/null +++ b/demo/food/14521898_14_IMG_3852.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fb9d439c333f46cf5c7c3867ff771adce427ca8abd0625287a9edc409cd62aa +size 4538255 diff --git a/demo/food/14521898_162_IMG_0356.jpeg b/demo/food/14521898_162_IMG_0356.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..ea2f5fd218ee3b4fd9504969463d511c6c11e0ca --- /dev/null +++ b/demo/food/14521898_162_IMG_0356.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cba6d96455c96da2f10d03f52f23e404262aaa61d136de2d425e687b9e01b91 +size 3334867 diff --git a/demo/food/14521898_168_D4D36C59-2D68-4354-B05C-B3065F6581AA.jpeg b/demo/food/14521898_168_D4D36C59-2D68-4354-B05C-B3065F6581AA.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..f7150719f2e414c025c0cb2a7be8225608fa063e --- /dev/null +++ 
b/demo/food/14521898_168_D4D36C59-2D68-4354-B05C-B3065F6581AA.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f09fa6ade4b0b77977236a3baf113ef1d5c2d294cf1a43a236fa55ed7f891814 +size 3275533 diff --git a/demo/food/14521898_174_IMG_8019.jpeg b/demo/food/14521898_174_IMG_8019.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..c51d857be8b7c72b60ffee8be64dd90e9476e0ce --- /dev/null +++ b/demo/food/14521898_174_IMG_8019.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2c1d1a65a4e75ee47f34dfd8660d98a6ab323af9ddd5b9fe0778a391165923b +size 3196281 diff --git a/demo/food/14521898_183_IMG_20240502_122921.jpg b/demo/food/14521898_183_IMG_20240502_122921.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4c941b911e1900b4c243102f758b508d7eb102cc --- /dev/null +++ b/demo/food/14521898_183_IMG_20240502_122921.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fde07270ec4b4d1efa77e2810151d3e26adc7c3724c517261566c746e914ac3 +size 3151183 diff --git a/demo/food/14521898_193_IMG_2435.jpeg b/demo/food/14521898_193_IMG_2435.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..22bebc3e7e980b12cafdcba34819138771d46391 Binary files /dev/null and b/demo/food/14521898_193_IMG_2435.jpeg differ diff --git a/demo/food/14521898_209_IMG_3105.jpeg b/demo/food/14521898_209_IMG_3105.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..72ec67bf234abbcd0539d61aa13ef0f51be84140 --- /dev/null +++ b/demo/food/14521898_209_IMG_3105.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ce4a99b2b895aa5a062785498f84cde41008a64c5ad6ac2933c6134e9ef112f +size 2159154 diff --git a/demo/food/14521898_211_IMG_9139.jpeg b/demo/food/14521898_211_IMG_9139.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..7194b53d26c2345ec1c003dcebe57e575f0996e3 --- /dev/null +++ b/demo/food/14521898_211_IMG_9139.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2cbfaba62677f81d6d04c5d1e429986a7efcfc68d9603de1ec7c5e76022061 +size 3228111 diff --git a/demo/food/14521898_237_IMG_5909.jpeg b/demo/food/14521898_237_IMG_5909.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..fe7f9ac4621ac8f9c9995ea91ca1a6077d52392b --- /dev/null +++ b/demo/food/14521898_237_IMG_5909.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a8ec84b640965f6f69b6e83dd2f8d57f34d2777b9f26d335b7afe4bd4fa1fc +size 2150570 diff --git a/demo/food/14521898_248_D8768DE2-559E-4850-A95E-07D5539F06C1.jpeg b/demo/food/14521898_248_D8768DE2-559E-4850-A95E-07D5539F06C1.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e712a0d971b5125b4e2d3882d0c30cafbec5104e Binary files /dev/null and b/demo/food/14521898_248_D8768DE2-559E-4850-A95E-07D5539F06C1.jpeg differ diff --git a/demo/food/14521898_249_0755477B-B9D5-4786-8D86-BFD4AC9478FD.jpeg b/demo/food/14521898_249_0755477B-B9D5-4786-8D86-BFD4AC9478FD.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e0134eeb3a3d4afafd2931cc08d4ff7cc2abcddd Binary files /dev/null and b/demo/food/14521898_249_0755477B-B9D5-4786-8D86-BFD4AC9478FD.jpeg differ diff --git a/demo/food/14521898_257_79874AA6-4F23-4AD6-96E2-7F64039A81C0.jpeg b/demo/food/14521898_257_79874AA6-4F23-4AD6-96E2-7F64039A81C0.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..497fd3cf5fe1d6ab5d014f6fb4bac9a0ed8e4343 --- /dev/null +++ 
b/demo/food/14521898_257_79874AA6-4F23-4AD6-96E2-7F64039A81C0.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01d68d9cfd915c5c4d8c366794b1cd63b9097c0a326eb90ea841c97bec6172a3 +size 2787609 diff --git a/demo/food/14521898_261_0E3192C7-1B0D-4C4D-9788-5E7A5E6E92BA.jpeg b/demo/food/14521898_261_0E3192C7-1B0D-4C4D-9788-5E7A5E6E92BA.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..2735f1bf18cc19073dd4fa1f267f1392f023ceda --- /dev/null +++ b/demo/food/14521898_261_0E3192C7-1B0D-4C4D-9788-5E7A5E6E92BA.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:893a879bcd7b569c272020a729c17af77389f80ff4193c259d83e32c422f76c6 +size 3240469 diff --git a/demo/food/14521898_262_75B3AEDF-C705-400B-AE86-3FA7A92B624C.jpeg b/demo/food/14521898_262_75B3AEDF-C705-400B-AE86-3FA7A92B624C.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..a4fc6ff0b10ac581376e83b14af04e3629efa943 --- /dev/null +++ b/demo/food/14521898_262_75B3AEDF-C705-400B-AE86-3FA7A92B624C.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe339cb0972c21f4e0bf79623f69ad7bdfdc732f0af955e52d5fb38202404be3 +size 3425619 diff --git a/demo/food/14521898_265_1C4A3265-A18E-46C2-A264-4384B85A49C0.jpeg b/demo/food/14521898_265_1C4A3265-A18E-46C2-A264-4384B85A49C0.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..a644bc048d69116d32c552751ddebe588a63e978 --- /dev/null +++ b/demo/food/14521898_265_1C4A3265-A18E-46C2-A264-4384B85A49C0.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:214bba49fb123d1d20897ee9e5391b89f633af14e3b5ee84146c1f6644ac1eec +size 3539520 diff --git a/demo/food/14521898_35_u_23162001_2841880362fm_170s_80187E975C411EC042A.png b/demo/food/14521898_35_u_23162001_2841880362fm_170s_80187E975C411EC042A.png new file mode 100644 index 0000000000000000000000000000000000000000..a9bd456d7a605366644e9617bc4837ede224c674 Binary files /dev/null and b/demo/food/14521898_35_u_23162001_2841880362fm_170s_80187E975C411EC042A.png differ diff --git a/demo/food/14521898_44_930F3702-FC28-498F-A5A7-9E01AA5AEA15.jpeg b/demo/food/14521898_44_930F3702-FC28-498F-A5A7-9E01AA5AEA15.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..474561d61c2c035567155801c1d7e125f21b3aa7 --- /dev/null +++ b/demo/food/14521898_44_930F3702-FC28-498F-A5A7-9E01AA5AEA15.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93100efa33f3122c6c419ab1fc89d1da2019f556f91a7285bd0fd0e5752b8fc5 +size 3438029 diff --git a/demo/food/14521898_55_IMG_8533.jpeg b/demo/food/14521898_55_IMG_8533.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..01ac1bd4eefaa026cf0f2cd3b0e2dc164f0b1b69 --- /dev/null +++ b/demo/food/14521898_55_IMG_8533.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a56308a61e73be157a4d5a4c6ebb8b8a2621bf0340a648b626d5a86765fa511 +size 3021757 diff --git a/demo/food/14521898_65_IMG_20240424_121225.jpg b/demo/food/14521898_65_IMG_20240424_121225.jpg new file mode 100644 index 0000000000000000000000000000000000000000..fea5ceee31c4f933caff3916adfaf9e2c79cbd22 --- /dev/null +++ b/demo/food/14521898_65_IMG_20240424_121225.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eaf0a61fc3dc0af6f73885322d220ee483ff9b9f21c3b190f25188ed741a11a +size 3669645 diff --git a/demo/food/14521898_75_IMG_8637.jpeg b/demo/food/14521898_75_IMG_8637.jpeg new file mode 100644 index 
0000000000000000000000000000000000000000..124aec8e25a80d2524da3a4fe3530dffcf58fae3 --- /dev/null +++ b/demo/food/14521898_75_IMG_8637.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c3231243f499daaa33d7978223e1cf1aedd04d6fa4427703cd4b7cbc1568459 +size 3191491 diff --git a/demo/food/14521898_79_IMG_2612.jpeg b/demo/food/14521898_79_IMG_2612.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..0798b01b044c6b9bfc23c68c339d42ae6ffedc40 --- /dev/null +++ b/demo/food/14521898_79_IMG_2612.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcd4f8782b814f29648572c60079966f9c71bd9524238d1ca5335536f3ec58d1 +size 2298822 diff --git a/demo/food/14521898_80_IMG_2613.jpeg b/demo/food/14521898_80_IMG_2613.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..84eb662889cd1807fb258c60ebdab67417110998 --- /dev/null +++ b/demo/food/14521898_80_IMG_2613.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea8eae0059f847b151cd9fbef328910a8391b5f27b78c8dd362fd915129a6c08 +size 2794108 diff --git a/demo/food/14521898_82_IMG_9847.jpeg b/demo/food/14521898_82_IMG_9847.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..1a12a2bd9b84990e2f1d60d183b3996bbbd2fdb5 --- /dev/null +++ b/demo/food/14521898_82_IMG_9847.jpeg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86cd59efee342be115a7347982362d7aa6d4180dc80c5a3e10cd70ce026b2fc0 +size 3177517 diff --git a/demo/food/14521898_93_Image_1713865938858.jpg b/demo/food/14521898_93_Image_1713865938858.jpg new file mode 100644 index 0000000000000000000000000000000000000000..c7d4d27943c73621cbe172f01ceb3b9a13607a48 Binary files /dev/null and b/demo/food/14521898_93_Image_1713865938858.jpg differ diff --git a/demo/food/14521898_97_Image_1713865926721.jpg b/demo/food/14521898_97_Image_1713865926721.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6f9bce54ebd9e049b36cd37a4818e4ccf3c34cf0 Binary files /dev/null and b/demo/food/14521898_97_Image_1713865926721.jpg differ diff --git a/demo/food_for_demo.json b/demo/food_for_demo.json new file mode 100644 index 0000000000000000000000000000000000000000..fbe0700703bf12c505d61f070dea629c1b823087 --- /dev/null +++ b/demo/food_for_demo.json @@ -0,0 +1,97 @@ +{ + "新疆菜": [ + "demo/food/14521898_75_IMG_8637.jpeg", + "demo/food/14456664_139_IMG_0917-EDIT.jpg", + "demo/food/14456664_80_IMG_6323.jpeg", + "demo/food/14456664_217_IMG_3854.jpeg", + "demo/food/14521898_14_IMG_3852.jpeg", + "demo/food/14456664_136_IMG_20220709_233905.jpg", + "demo/food/14456664_137_IMG_20220717_143941.jpg", + "demo/food/14456664_141_IMG_1724.jpg", + "demo/food/14456664_134_IMG_20220709_153652.jpg", + "demo/food/14456664_133_IMG_20220709_153436.jpg" + ], + "川菜(四川,重庆)": [ + "demo/food/14521898_113_IMG_1724.jpeg", + "demo/food/14521898_193_IMG_2435.jpeg", + "demo/food/14521898_44_930F3702-FC28-498F-A5A7-9E01AA5AEA15.jpeg", + "demo/food/14521898_80_IMG_2613.jpeg", + "demo/food/14456664_68_IMG_0971.jpeg", + "demo/food/14521898_174_IMG_8019.jpeg", + "demo/food/14456664_88_wx_camera_1712730236546.jpg", + "demo/food/14456664_36_20240410160819.jpg", + "demo/food/14456664_238_d370e16ff482d13a0718c6fb4cd7ffbb.jpeg", + "demo/food/14521898_183_IMG_20240502_122921.jpg", + "demo/food/14456664_40_IMG_20170613_190017.jpg", + "demo/food/14456664_150_IMG_20181218_190852.jpg", + "demo/food/14456664_72_IMG_0618.jpeg", + "demo/food/14521898_82_IMG_9847.jpeg" + ], + "西北菜 (陕西,甘肃等地)": [ + 
"demo/food/14456664_82_MVIMG_20240413_161454.jpeg", + "demo/food/14521898_261_0E3192C7-1B0D-4C4D-9788-5E7A5E6E92BA.jpeg", + "demo/food/14456664_50_mmexport1712821641752.jpg", + "demo/food/14456664_126_86838F28-912B-42A4-80C1-BD060B649081.jpeg", + "demo/food/14521898_262_75B3AEDF-C705-400B-AE86-3FA7A92B624C.jpeg", + "demo/food/14456664_22_IMG_20240404_135935.jpg", + "demo/food/14521898_265_1C4A3265-A18E-46C2-A264-4384B85A49C0.jpeg", + "demo/food/14456664_127_1E703639-9169-4EB4-8A5D-3B1E226F639B.jpeg" + ], + "黔菜 (贵州)": [ + "demo/food/14521898_35_u_23162001_2841880362fm_170s_80187E975C411EC042A.png", + "demo/food/14456664_61_mmexport1550554859117.jpg", + "demo/food/14456664_201_mmexport1713452408322.jpg", + "demo/food/14521898_211_IMG_9139.jpeg", + "demo/food/14456664_187_mmexport1713446177339.jpg", + "demo/food/14521898_97_Image_1713865926721.jpg", + "demo/food/14456664_41_IMG_20190216_124017.jpeg", + "demo/food/14521898_93_Image_1713865938858.jpg" + ], + "苏菜(江苏)": [ + "demo/food/14456664_64_IMG_20240409_181047.jpg", + "demo/food/14456664_149_IMG_20180812_182400.jpg", + "demo/food/14521898_65_IMG_20240424_121225.jpg", + "demo/food/14456664_236_IMG_4742.jpeg", + "demo/food/14521898_55_IMG_8533.jpeg", + "demo/food/14521898_109_IMG_5997.jpeg", + "demo/food/14521898_79_IMG_2612.jpeg", + "demo/food/14456664_34_IMG_0400.jpeg", + "demo/food/14521898_248_D8768DE2-559E-4850-A95E-07D5539F06C1.jpeg" + ], + "粤菜(广东等地)": [ + "demo/food/14521898_257_79874AA6-4F23-4AD6-96E2-7F64039A81C0.jpeg", + "demo/food/14521898_144_IMG_8800.jpeg", + "demo/food/14456664_155_IMG_20240403_094547.jpg", + "demo/food/14456664_87_IMG_5316.jpeg", + "demo/food/14521898_249_0755477B-B9D5-4786-8D86-BFD4AC9478FD.jpeg", + "demo/food/14521898_112_IMG_3570.jpeg", + "demo/food/14456664_153_IMG_20240411_102841.jpg", + "demo/food/14456664_70_IMG_0581.jpeg", + "demo/food/14521898_162_IMG_0356.jpeg", + "demo/food/14521898_117_IMG_1362.jpeg" + ], + "湘菜(湖南)": [ + "demo/food/14521898_12_fb48488412a1846c104b28600f4f1ded.jpeg", + "demo/food/14456664_147_IMG_20190225_184723.jpg", + "demo/food/14521898_209_IMG_3105.jpeg" + ], + "闽菜(福建)": [ + "demo/food/14521898_108_IMG_6205.jpeg", + "demo/food/14456664_129_IMG_20220605_181308.jpg", + "demo/food/14521898_168_D4D36C59-2D68-4354-B05C-B3065F6581AA.jpeg" + ], + "浙菜(浙江)": [ + "demo/food/14456664_159_IMG_20240416_140243.jpg", + "demo/food/14456664_239_mmexport1713518196678.jpg", + "demo/food/14456664_130_IMG_20220605_180820.jpg", + "demo/food/14456664_117_IMG_8176.jpeg", + "demo/food/14456664_19_IMG_0395.jpeg", + "demo/food/14456664_158_IMG_20240416_141427.jpg", + "demo/food/14521898_122_53D0E884-E012-4FFA-8BD7-8F5666A53123.jpeg" + ], + "东北菜 (黑龙江等地)": [ + "demo/food/14456664_26_IMG_3472.jpeg", + "demo/food/14521898_237_IMG_5909.jpeg", + "demo/food/14456664_14_IMG_20220721_150241.jpg" + ] +} \ No newline at end of file diff --git a/demo/theme.py b/demo/theme.py new file mode 100644 index 0000000000000000000000000000000000000000..3530a4b7e9427541e4613d83b16db0bef0414ca3 --- /dev/null +++ b/demo/theme.py @@ -0,0 +1,53 @@ +from gradio.themes.base import Base +from gradio.themes.utils import colors, fonts, sizes +from typing import Union, Iterable + + +class CustomTheme(Base): + def __init__( + self, + primary_hue: Union[colors.Color, str] = colors.emerald, + secondary_hue: Union[colors.Color, str] = colors.blue, + neutral_hue: Union[colors.Color, str] = colors.slate, + spacing_size: Union[sizes.Size, str] = sizes.spacing_md, + radius_size: Union[sizes.Size, str] = sizes.radius_md, + text_size: 
Union[sizes.Size, str] = sizes.text_lg, + font: Union[fonts.Font, str, Iterable[Union[fonts.Font, str]]] = ( + fonts.GoogleFont("Alice"), + "ui-sans-serif", + "sans-serif", + ), + font_mono: Union[fonts.Font, str, Iterable[Union[fonts.Font, str]]] = ( + fonts.GoogleFont("Merriweather"), + "ui-monospace", + "monospace", + ), + ): + super().__init__( + primary_hue=primary_hue, + secondary_hue=secondary_hue, + neutral_hue=neutral_hue, + spacing_size=spacing_size, + radius_size=radius_size, + text_size=text_size, + font=font, + font_mono=font_mono, + ) + super().set( + body_background_fill="#ECF2F7", + body_background_fill_dark="#191919", + button_primary_background_fill="linear-gradient(90deg, *primary_300, *secondary_400)", + button_primary_background_fill_hover="*primary_700", + button_primary_text_color="white", + button_primary_background_fill_dark="linear-gradient(90deg, *primary_600, *secondary_800)", + slider_color="#4EACEF", + slider_color_dark="#4EACEF", + block_title_text_weight="600", + block_title_text_size="*text_md", + block_label_text_weight="600", + block_label_text_size="*text_md", + block_border_width="1px", + block_shadow="#FFFFFF00", + button_shadow="*shadow_drop_lg", + button_large_padding="*spacing_lg calc(2 * *spacing_lg)", + ) \ No newline at end of file diff --git a/demo/user.png b/demo/user.png new file mode 100644 index 0000000000000000000000000000000000000000..5dfdd89145ba278a582e454d011b524c2696e0ab Binary files /dev/null and b/demo/user.png differ diff --git a/outputs/logs/Dec24_14-16-43.txt b/outputs/logs/Dec24_14-16-43.txt new file mode 100644 index 0000000000000000000000000000000000000000..32dd7081712206238ab228ce37970d6293c16d7b --- /dev/null +++ b/outputs/logs/Dec24_14-16-43.txt @@ -0,0 +1,59 @@ +2024-12-24 14:16:44,738 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:16:44,738 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:16:44,738 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:16:44,738 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:23,159 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:23,161 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:23,161 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:23,161 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:23,191 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:23,191 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:23,191 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:23,191 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:24,253 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:24,254 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:24,254 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:24,254 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:42,809 modeling_internvl_chat.py[line:54] INFO || num_image_token: 256 +2024-12-24 14:17:42,812 modeling_internvl_chat.py[line:55] INFO || ps_version: v2 +2024-12-24 14:17:49,038 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:49,038 
configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:49,038 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:49,038 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:49,666 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:49,666 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:49,666 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:49,666 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:52,677 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:52,677 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:52,677 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:52,677 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:52,705 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:52,705 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:52,705 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:52,705 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:52,732 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:52,732 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:52,732 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:52,732 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:52,755 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:52,755 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:52,755 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:52,756 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:52,780 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:52,780 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:52,780 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:52,780 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:52,806 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:52,806 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:52,806 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:52,806 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:58,867 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:17:58,867 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:17:58,867 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:17:58,867 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:17:59,754 _client.py[line:1025] INFO || HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 " +2024-12-24 14:18:01,647 _client.py[line:1025] INFO || HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2024-12-24 14:18:04,490 _client.py[line:1025] INFO || HTTP 
Request: GET http://127.0.0.1:1096/startup-events "HTTP/1.1 200 OK" +2024-12-24 14:18:04,754 _client.py[line:1025] INFO || HTTP Request: HEAD http://127.0.0.1:1096/ "HTTP/1.1 200 OK" +2024-12-24 14:18:06,277 _client.py[line:1025] INFO || HTTP Request: GET https://api.gradio.app/v2/tunnel-request "HTTP/1.1 200 OK" diff --git a/outputs/logs/Dec24_14-48-37.txt b/outputs/logs/Dec24_14-48-37.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a25cd9a54f8f8f0b12b8f6968d67c768b290802 --- /dev/null +++ b/outputs/logs/Dec24_14-48-37.txt @@ -0,0 +1,73 @@ +2024-12-24 14:48:37,587 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:48:37,587 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:48:37,587 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:48:37,588 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:02,486 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:02,488 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:02,489 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:02,489 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:02,510 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:02,511 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:02,511 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:02,511 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:03,245 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:03,245 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:03,245 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:03,245 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:11,789 modeling_internvl_chat.py[line:54] INFO || num_image_token: 256 +2024-12-24 14:49:11,789 modeling_internvl_chat.py[line:55] INFO || ps_version: v2 +2024-12-24 14:49:16,358 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:16,362 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:16,362 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:16,362 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:16,753 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:16,753 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:16,753 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:16,753 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:17,524 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:17,524 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:17,524 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:17,524 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:17,557 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:17,557 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 
+2024-12-24 14:49:17,557 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:17,557 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:17,618 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:17,618 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:17,618 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:17,618 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:17,647 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:17,647 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:17,647 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:17,647 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:17,687 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:17,687 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:17,687 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:17,687 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:17,724 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:17,724 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:17,724 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:17,724 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:20,526 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-24 14:49:20,526 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-24 14:49:20,526 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-24 14:49:20,526 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-24 14:49:21,062 _client.py[line:1025] INFO || HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 " +2024-12-24 14:49:22,650 _client.py[line:1025] INFO || HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2024-12-24 14:49:23,958 _client.py[line:1025] INFO || HTTP Request: GET http://127.0.0.1:1096/startup-events "HTTP/1.1 200 OK" +2024-12-24 14:49:24,132 _client.py[line:1025] INFO || HTTP Request: HEAD http://127.0.0.1:1096/ "HTTP/1.1 200 OK" +2024-12-24 14:49:26,524 _client.py[line:1025] INFO || HTTP Request: GET https://api.gradio.app/v2/tunnel-request "HTTP/1.1 200 OK" +2024-12-24 14:51:03,755 agent.py[line:30] INFO || ==============================Start Chat============================== +2024-12-24 15:17:07,989 agent.py[line:84] INFO || Time: Dec24-15:17:07 +2024-12-24 15:17:07,993 agent.py[line:85] INFO || User: 这是什么东西 +2024-12-24 15:17:08,082 agent.py[line:91] INFO || image save path: /root/InternVL2-Tutorial/outputs/uploaded/0.jpg +2024-12-24 15:17:22,815 agent.py[line:103] INFO || generated text = +这是一只机器人。从外观上看,它具有一个圆圆的头,两个大大的眼睛,以及一个显示屏。显示屏上似乎显示了一些文字和数据。机器人的设计风格现代,看起来像是一个智能助手或娱乐机器人。 +2024-12-24 15:22:16,494 agent.py[line:64] INFO || type(image): +2024-12-24 15:22:16,611 agent.py[line:69] INFO || image save path: /root/InternVL2-Tutorial/outputs/uploaded/1.jpg +2024-12-24 15:22:28,851 agent.py[line:84] INFO || Time: Dec24-15:22:28 +2024-12-24 15:22:28,851 agent.py[line:85] INFO || User: 这张图表达了什么 +2024-12-24 15:22:29,573 
agent.py[line:103] INFO || generated text = +这张图展示了一只机器人,它具有一个圆圆的头,两个大大的眼睛,以及一个显示屏。显示屏上显示了一些文字和数据。从整体设计来看,这只机器人看起来像是一个智能助手或娱乐机器人。 + +这张图可能表达了科技与人类之间的互动,以及机器人在现代生活中的应用。它也可能象征着未来科技的进步和机器人技术的发展。 diff --git a/outputs/logs/Dec25_10-59-44.txt b/outputs/logs/Dec25_10-59-44.txt new file mode 100644 index 0000000000000000000000000000000000000000..e51ba37e5c2120bfde527edb67b13c5849492ab7 --- /dev/null +++ b/outputs/logs/Dec25_10-59-44.txt @@ -0,0 +1,73 @@ +2024-12-25 10:59:44,773 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 10:59:44,773 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 10:59:44,773 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 10:59:44,780 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:27,085 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:27,087 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:27,087 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:27,087 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:27,103 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:27,103 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:27,103 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:27,110 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:28,351 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:28,351 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:28,351 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:28,351 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:48,563 modeling_internvl_chat.py[line:54] INFO || num_image_token: 256 +2024-12-25 11:00:48,564 modeling_internvl_chat.py[line:55] INFO || ps_version: v2 +2024-12-25 11:00:54,456 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:54,456 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:54,456 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:54,456 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:54,935 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:54,935 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:54,935 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:54,935 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:58,076 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:58,076 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:58,076 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:58,076 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:58,100 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:58,100 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:58,100 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:58,108 
configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:58,133 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:58,133 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:58,133 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:58,139 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:58,156 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:58,156 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:58,156 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:58,165 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:58,188 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:58,188 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:58,188 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:58,194 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:00:58,223 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:00:58,223 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:00:58,223 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:00:58,227 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:01:02,859 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-25 11:01:02,859 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-25 11:01:02,859 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-25 11:01:02,865 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-25 11:01:03,631 _client.py[line:1025] INFO || HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 " +2024-12-25 11:01:04,114 _client.py[line:1025] INFO || HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2024-12-25 11:01:11,121 _client.py[line:1025] INFO || HTTP Request: GET http://127.0.0.1:1096/startup-events "HTTP/1.1 200 OK" +2024-12-25 11:01:11,508 _client.py[line:1025] INFO || HTTP Request: HEAD http://127.0.0.1:1096/ "HTTP/1.1 200 OK" +2024-12-25 11:01:12,399 _client.py[line:1025] INFO || HTTP Request: GET https://api.gradio.app/v2/tunnel-request "HTTP/1.1 200 OK" +2024-12-25 11:04:58,154 agent.py[line:30] INFO || ==============================Start Chat============================== +2024-12-25 11:05:12,324 agent.py[line:84] INFO || Time: Dec25-11:05:12 +2024-12-25 11:05:12,324 agent.py[line:85] INFO || User: 图片中的食物通常属于哪个菜系? +2024-12-25 11:05:12,498 agent.py[line:91] INFO || image save path: /root/InternVL2-Tutorial/outputs/uploaded/2.jpg +2024-12-25 11:05:27,472 agent.py[line:103] INFO || generated text = +图片中的食物看起来像是饺子,具体来说,是饺子的一种,可能是煎饺或蒸饺。饺子是中国传统的食品,常见于各种菜系中,包括北方菜系、粤菜系和川菜系。 + +饺子在中国的许多地方都有不同的做法和口味,例如: + +1. **北方菜系**:饺子通常是用面皮包裹肉馅或素馅,然后煎或蒸。 +2. **粤菜系**:粤菜饺子通常使用薄皮,馅料丰富,如虾仁、猪肉、牛肉、蔬菜等。 +3. 
**川菜系**:川菜饺子通常使用面皮,馅料多样,如猪肉、牛肉、鸡肉、蔬菜等,有时还会加入花椒等调料。 + +根据图片中的饺子外观和馅料,可以推测它可能是粤菜或川菜饺子。 diff --git a/outputs/logs/Dec26_15-43-14.txt b/outputs/logs/Dec26_15-43-14.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc663f03451a4f52f5583c5edcc3cbde2b62b3c6 --- /dev/null +++ b/outputs/logs/Dec26_15-43-14.txt @@ -0,0 +1,70 @@ +2024-12-26 15:43:15,885 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:43:15,885 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:43:15,885 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:43:15,885 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:44:02,921 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:44:02,923 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:44:02,923 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:44:02,923 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:44:02,951 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:44:02,951 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:44:02,951 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:44:02,951 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:44:03,658 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:44:03,658 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:44:03,658 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:44:03,658 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:44:27,055 modeling_internvl_chat.py[line:54] INFO || num_image_token: 256 +2024-12-26 15:44:27,061 modeling_internvl_chat.py[line:55] INFO || ps_version: v2 +2024-12-26 15:46:21,687 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:21,691 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:21,691 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:21,691 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:22,318 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:22,318 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:22,318 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:22,318 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:25,794 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:25,794 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:25,794 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:25,794 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:25,838 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:25,838 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:25,838 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:25,838 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:25,883 
configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:25,883 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:25,883 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:25,883 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:25,934 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:25,934 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:25,934 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:25,934 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:25,986 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:25,986 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:25,987 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:25,987 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:26,028 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:26,028 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:26,028 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:26,028 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:29,779 configuration_internvl_chat.py[line:68] INFO || vision_select_layer: -1 +2024-12-26 15:46:29,779 configuration_internvl_chat.py[line:69] INFO || ps_version: v2 +2024-12-26 15:46:29,779 configuration_internvl_chat.py[line:70] INFO || min_dynamic_patch: 1 +2024-12-26 15:46:29,779 configuration_internvl_chat.py[line:71] INFO || max_dynamic_patch: 12 +2024-12-26 15:46:30,347 _client.py[line:1025] INFO || HTTP Request: GET https://checkip.amazonaws.com/ "HTTP/1.1 200 " +2024-12-26 15:46:30,835 _client.py[line:1025] INFO || HTTP Request: GET https://api.gradio.app/pkg-version "HTTP/1.1 200 OK" +2024-12-26 15:46:38,907 _client.py[line:1025] INFO || HTTP Request: GET http://127.0.0.1:1096/startup-events "HTTP/1.1 200 OK" +2024-12-26 15:46:39,254 _client.py[line:1025] INFO || HTTP Request: HEAD http://127.0.0.1:1096/ "HTTP/1.1 200 OK" +2024-12-26 15:46:40,102 _client.py[line:1025] INFO || HTTP Request: GET https://api.gradio.app/v2/tunnel-request "HTTP/1.1 200 OK" +2024-12-26 15:53:18,192 agent.py[line:30] INFO || ==============================Start Chat============================== +2024-12-26 15:53:23,849 agent.py[line:84] INFO || Time: Dec26-15:53:23 +2024-12-26 15:53:23,849 agent.py[line:85] INFO || User: 图片中的食物通常属于哪个菜系? 
+2024-12-26 15:53:23,959 agent.py[line:91] INFO || image save path: /root/InternVL2-Tutorial/outputs/uploaded/3.jpg
+2024-12-26 15:53:39,981 agent.py[line:103] INFO || generated text = 
+粤菜,图中的菜是鸡蛋肠粉
+2024-12-26 15:58:07,528 agent.py[line:84] INFO || Time: Dec26-15:58:07
+2024-12-26 15:58:07,533 agent.py[line:85] INFO || User: 这是什么菜
+2024-12-26 15:58:07,672 agent.py[line:91] INFO || image save path: /root/InternVL2-Tutorial/outputs/uploaded/4.jpg
+2024-12-26 15:58:08,544 agent.py[line:103] INFO || generated text = 
+东北,图中的菜是锅包肉
diff --git a/outputs/uploaded/0.jpg b/outputs/uploaded/0.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..00a8702321b9ed3175cd0938a4c9680a3b46b265
Binary files /dev/null and b/outputs/uploaded/0.jpg differ
diff --git a/outputs/uploaded/1.jpg b/outputs/uploaded/1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a5f2993942557c7b2f3b56dd7682951bc4e84739
Binary files /dev/null and b/outputs/uploaded/1.jpg differ
diff --git a/outputs/uploaded/2.jpg b/outputs/uploaded/2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c2c90342402d5c292f4073a4fde827dd5d256bab
Binary files /dev/null and b/outputs/uploaded/2.jpg differ
diff --git a/outputs/uploaded/3.jpg b/outputs/uploaded/3.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..c2c90342402d5c292f4073a4fde827dd5d256bab
Binary files /dev/null and b/outputs/uploaded/3.jpg differ
diff --git a/outputs/uploaded/4.jpg b/outputs/uploaded/4.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..712d6ede892e1ebe608ed6c4a1f404255afda2cf
--- /dev/null
+++ b/outputs/uploaded/4.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aa30cea1b5c1172c06ce13f4e2f02fabb1b82f321b0b2927ce812f5629bd06f0
+size 1076332
diff --git a/process_food.py b/process_food.py
new file mode 100644
index 0000000000000000000000000000000000000000..6acd53da37f76a7f9f34765d72b80290d2abbbf0
--- /dev/null
+++ b/process_food.py
@@ -0,0 +1,25 @@
+import json
+input_path = "/root/huggingface/FoodieQA/FoodieQA/sivqa_tidy.json" # sivqa_tidy.json所在位置
+output_path = "/root/huggingface/FoodieQA/FoodieQA/sivqa_llava.json" # 输出文件位置
+
+with open(input_path, 'r', encoding='utf-8') as f:
+    foodqa = json.load(f)
+
+llava_format = []
+for data in foodqa:
+    llava_format.append({
+        "image": data['food_meta']['food_file'],
+        "conversations": [
+            {
+                "from": "human",
+                "value": data['question']+"\n"
+            },
+            {
+                "from": "gpt",
+                "value": data['choices'][int(data['answer'])] + ",图中的菜是"+ data['food_meta']['food_name']
+            }
+        ]
+    })
+
+with open(output_path, 'w', encoding='utf-8') as f:
+    json.dump(llava_format, f, indent=4, ensure_ascii=False)
\ No newline at end of file
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000000000000000000000000000000000000..9a10f76ebcb39edbab932f1fbdca9fb2ed7ef004
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,7 @@
+# 书生大模型实战营
+## \[进阶岛\] InternVL 多模态模型部署微调实践
+
+* `demo` 网页应用依赖相关文件
+* `demo.py` 运行网页应用的入口
+* `utils.py` 相关工具函数
+* `process_food.py` 处理数据集
\ No newline at end of file
diff --git a/utils.py b/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..be4d578eb65559e4d083d351ae24533def9d05af
--- /dev/null
+++ b/utils.py
@@ -0,0 +1,25 @@
+import os
+import json
+import logging
+from datetime import datetime
+
+
+def load_json(file_name: str):
+    if isinstance(file_name, str) and file_name.endswith("json"):
+        with open(file_name, 'r') as file:
+            data = json.load(file)
+    else:
+        raise
ValueError("The file path you passed in is not a json file path.") + + return data + +def init_logger(outputs_dir): + current_time = datetime.now().strftime("%b%d_%H-%M-%S") + os.makedirs(os.path.join(outputs_dir, "logs"), exist_ok=True) + log_path = os.path.join(outputs_dir, "logs", "{}.txt".format(current_time)) + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s || %(message)s", + handlers=[logging.StreamHandler(), logging.FileHandler(log_path)], + ) + \ No newline at end of file diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241225_111529/20241225_111529.log b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241225_111529/20241225_111529.log new file mode 100644 index 0000000000000000000000000000000000000000..28acfaf5b11db54ea48dd71bc06cbf3297c200ce --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241225_111529/20241225_111529.log @@ -0,0 +1,431 @@ +2024/12/25 11:15:30 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] + CUDA available: True + MUSA available: False + numpy_random_seed: 325847556 + GPU 0: NVIDIA A100-SXM4-80GB + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.2, V12.2.140 + GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 + PyTorch: 2.4.1+cu121 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v3.4.2 (Git Hash 1137e04ec0b5251ca2b4400a4fd3c667ce843d67) + - OpenMP 201511 (a.k.a. OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX512 + - CUDA Runtime 12.1 + - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90 + - CuDNN 90.1 (built against CUDA 12.4) + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=12.1, CUDNN_VERSION=9.1.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=2.4.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, + + TorchVision: 
0.19.1+cu121 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + launcher: none + randomness: {'seed': None, 'deterministic': False} + cudnn_benchmark: False + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: None + deterministic: False + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/12/25 11:15:30 - mmengine - INFO - Config: +accumulative_counts = 2 +batch_size = 4 +betas = ( + 0.9, + 0.999, +) +custom_hooks = [ + dict( + tokenizer=dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained'), + type='xtuner.engine.hooks.DatasetInfoHook'), +] +data_path = '/root/share/datasets/FoodieQA/sivqa_llava.json' +data_root = '/root/share/datasets/FoodieQA/' +dataloader_num_workers = 4 +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=64, + max_keep_ckpts=-1, + save_optimizer=False, + type='mmengine.hooks.CheckpointHook'), + logger=dict( + interval=10, + log_metric_by_epoch=False, + type='mmengine.hooks.LoggerHook'), + param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'), + sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'), + timer=dict(type='mmengine.hooks.IterTimerHook')) +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +image_folder = '/root/share/datasets/FoodieQA/' +launcher = 'none' +llava_dataset = dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset') +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=False) +lr = 3e-05 +max_epochs = 10 +max_length = 8192 +max_norm = 1 +model = dict( + freeze_llm=True, + freeze_visual_encoder=True, + llm_lora=dict( + lora_alpha=256, + lora_dropout=0.05, + r=128, + target_modules=None, + task_type='CAUSAL_LM', + type='peft.LoraConfig'), + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + type='xtuner.model.InternVL_V1_5') +optim_type = 'torch.optim.AdamW' +optim_wrapper = dict( + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + lr=3e-05, + type='torch.optim.AdamW', + weight_decay=0.05), + type='DeepSpeedOptimWrapper') +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + convert_to_iter_based=True, + end=0.3, + start_factor=1e-05, + type='mmengine.optim.LinearLR'), + dict( + begin=0.3, + by_epoch=True, + convert_to_iter_based=True, + end=10, + eta_min=0.0, + type='mmengine.optim.CosineAnnealingLR'), +] +path = '/root/share/new_models/OpenGVLab/InternVL2-2B' +prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.internlm2_chat' +randomness = dict(deterministic=False, seed=None) +resume = False +runner_type = 'FlexibleRunner' +save_steps = 64 +save_total_limit = -1 +strategy = dict( + config=dict( + bf16=dict(enabled=True), + fp16=dict(enabled=False, initial_scale_power=16), + gradient_accumulation_steps='auto', + gradient_clipping='auto', + train_micro_batch_size_per_gpu='auto', + zero_allow_untested_optimizer=True, + zero_force_ds_cpu_optimizer=False, + zero_optimization=dict(overlap_comm=True, stage=2)), + exclude_frozen_parameters=True, + gradient_accumulation_steps=2, + gradient_clipping=1, + 
sequence_parallel_size=1, + train_micro_batch_size_per_gpu=4, + type='xtuner.engine.DeepSpeedStrategy') +tokenizer = dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained') +train_cfg = dict(max_epochs=10, type='xtuner.engine.runner.TrainLoop') +train_dataloader = dict( + batch_size=4, + collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'), + dataset=dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset'), + num_workers=4, + sampler=dict( + length_property='modality_length', + per_device_batch_size=8, + type='xtuner.dataset.samplers.LengthGroupedSampler')) +visualizer = None +warmup_ratio = 0.03 +weight_decay = 0.05 +work_dir = './work_dirs/internvl_v2_internlm2_2b_lora_finetune_food' + +2024/12/25 11:15:31 - mmengine - WARNING - Failed to search registry with scope "mmengine" in the "builder" registry tree. As a workaround, the current "builder" registry in "xtuner" is used to build instance. This may cause unexpected failure when running the built modules. Please check whether "mmengine" is a correct scope, or whether the registry is initialized. +2024/12/25 11:15:31 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DatasetInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) DatasetInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) DatasetInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/12/25 11:15:31 
- mmengine - INFO - Starting to loading data and calc length +2024/12/25 11:15:31 - mmengine - INFO - =======Starting to process /root/share/datasets/FoodieQA/sivqa_llava.json ======= +2024/12/25 11:15:31 - mmengine - INFO - =======total 256 samples of /root/share/datasets/FoodieQA/sivqa_llava.json======= +2024/12/25 11:15:31 - mmengine - INFO - end loading data and calc length +2024/12/25 11:15:31 - mmengine - INFO - =======total 256 samples======= +2024/12/25 11:15:32 - mmengine - INFO - LengthGroupedSampler is used. +2024/12/25 11:15:32 - mmengine - INFO - LengthGroupedSampler construction is complete, and the selected attribute is modality_length +2024/12/25 11:15:32 - mmengine - WARNING - Dataset InternVL_V1_5_Dataset has no metainfo. ``dataset_meta`` in visualizer will be None. +2024/12/25 11:15:32 - mmengine - INFO - Start to load InternVL_V1_5 model. +2024/12/25 11:16:14 - mmengine - INFO - InternVL_V1_5( + (data_preprocessor): BaseDataPreprocessor() + (model): InternVLChatModel( + (vision_model): InternVisionModel( + (embeddings): InternVisionEmbeddings( + (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + ) + (encoder): InternVisionEncoder( + (layers): ModuleList( + (0-23): 24 x InternVisionEncoderLayer( + (attn): InternAttention( + (qkv): Linear(in_features=1024, out_features=3072, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=1024, out_features=1024, bias=True) + ) + (mlp): InternMLP( + (act): GELUActivation() + (fc1): Linear(in_features=1024, out_features=4096, bias=True) + (fc2): Linear(in_features=4096, out_features=1024, bias=True) + ) + (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (drop_path1): Identity() + (drop_path2): Identity() + ) + ) + ) + ) + (language_model): PeftModelForCausalLM( + (base_model): LoraModel( + (model): InternLM2ForCausalLM( + (model): InternLM2Model( + (tok_embeddings): Embedding(92553, 2048, padding_idx=2) + (layers): ModuleList( + (0-23): 24 x InternLM2DecoderLayer( + (attention): InternLM2Attention( + (wqkv): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=4096, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=4096, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (wo): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=2048, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=2048, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (rotary_emb): InternLM2DynamicNTKScalingRotaryEmbedding() + ) + (feed_forward): InternLM2MLP( + (w1): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=8192, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): 
Linear(in_features=128, out_features=8192, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (w3): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=8192, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=8192, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (w2): lora.Linear( + (base_layer): Linear(in_features=8192, out_features=2048, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=8192, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=2048, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (act_fn): SiLU() + ) + (attention_norm): InternLM2RMSNorm() + (ffn_norm): InternLM2RMSNorm() + ) + ) + (norm): InternLM2RMSNorm() + ) + (output): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=92553, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=92553, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + ) + ) + ) + (mlp1): Sequential( + (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + (1): Linear(in_features=4096, out_features=2048, bias=True) + (2): GELU(approximate='none') + (3): Linear(in_features=2048, out_features=2048, bias=True) + ) + ) +) +2024/12/25 11:16:14 - mmengine - INFO - InternVL_V1_5 construction is complete diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241225_111529/vis_data/config.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241225_111529/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..1e03a56b0d09f5f7e3fe173adf5b8088953c1cea --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241225_111529/vis_data/config.py @@ -0,0 +1,139 @@ +accumulative_counts = 2 +batch_size = 4 +betas = ( + 0.9, + 0.999, +) +custom_hooks = [ + dict( + tokenizer=dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained'), + type='xtuner.engine.hooks.DatasetInfoHook'), +] +data_path = '/root/share/datasets/FoodieQA/sivqa_llava.json' +data_root = '/root/share/datasets/FoodieQA/' +dataloader_num_workers = 4 +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=64, + max_keep_ckpts=-1, + save_optimizer=False, + type='mmengine.hooks.CheckpointHook'), + logger=dict( + interval=10, + log_metric_by_epoch=False, + type='mmengine.hooks.LoggerHook'), + param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'), + sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'), + timer=dict(type='mmengine.hooks.IterTimerHook')) +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + 
mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +image_folder = '/root/share/datasets/FoodieQA/' +launcher = 'none' +llava_dataset = dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset') +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=False) +lr = 3e-05 +max_epochs = 10 +max_length = 8192 +max_norm = 1 +model = dict( + freeze_llm=True, + freeze_visual_encoder=True, + llm_lora=dict( + lora_alpha=256, + lora_dropout=0.05, + r=128, + target_modules=None, + task_type='CAUSAL_LM', + type='peft.LoraConfig'), + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + type='xtuner.model.InternVL_V1_5') +optim_type = 'torch.optim.AdamW' +optim_wrapper = dict( + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + lr=3e-05, + type='torch.optim.AdamW', + weight_decay=0.05), + type='DeepSpeedOptimWrapper') +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + convert_to_iter_based=True, + end=0.3, + start_factor=1e-05, + type='mmengine.optim.LinearLR'), + dict( + begin=0.3, + by_epoch=True, + convert_to_iter_based=True, + end=10, + eta_min=0.0, + type='mmengine.optim.CosineAnnealingLR'), +] +path = '/root/share/new_models/OpenGVLab/InternVL2-2B' +prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.internlm2_chat' +randomness = dict(deterministic=False, seed=None) +resume = False +runner_type = 'FlexibleRunner' +save_steps = 64 +save_total_limit = -1 +strategy = dict( + config=dict( + bf16=dict(enabled=True), + fp16=dict(enabled=False, initial_scale_power=16), + gradient_accumulation_steps='auto', + gradient_clipping='auto', + train_micro_batch_size_per_gpu='auto', + zero_allow_untested_optimizer=True, + zero_force_ds_cpu_optimizer=False, + zero_optimization=dict(overlap_comm=True, stage=2)), + exclude_frozen_parameters=True, + gradient_accumulation_steps=2, + gradient_clipping=1, + sequence_parallel_size=1, + train_micro_batch_size_per_gpu=4, + type='xtuner.engine.DeepSpeedStrategy') +tokenizer = dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained') +train_cfg = dict(max_epochs=10, type='xtuner.engine.runner.TrainLoop') +train_dataloader = dict( + batch_size=4, + collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'), + dataset=dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset'), + num_workers=4, + sampler=dict( + length_property='modality_length', + per_device_batch_size=8, + type='xtuner.dataset.samplers.LengthGroupedSampler')) +visualizer = None +warmup_ratio = 0.03 +weight_decay = 0.05 +work_dir = './work_dirs/internvl_v2_internlm2_2b_lora_finetune_food' diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/20241226_091548.log b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/20241226_091548.log new file mode 100644 index 0000000000000000000000000000000000000000..858516b8bcb23ad2091d09447923d9d253be5563 --- /dev/null +++ 
b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/20241226_091548.log @@ -0,0 +1,486 @@ +2024/12/26 09:15:49 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] + CUDA available: True + MUSA available: False + numpy_random_seed: 127347971 + GPU 0: NVIDIA A100-SXM4-80GB + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.2, V12.2.140 + GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 + PyTorch: 2.4.1+cu121 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v3.4.2 (Git Hash 1137e04ec0b5251ca2b4400a4fd3c667ce843d67) + - OpenMP 201511 (a.k.a. OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX512 + - CUDA Runtime 12.1 + - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90 + - CuDNN 90.1 (built against CUDA 12.4) + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=12.1, CUDNN_VERSION=9.1.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=2.4.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, + + TorchVision: 0.19.1+cu121 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + launcher: none + randomness: {'seed': None, 'deterministic': False} + cudnn_benchmark: False + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: None + deterministic: False + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/12/26 09:15:49 - mmengine - INFO - Config: +accumulative_counts = 2 +batch_size = 4 +betas = ( + 0.9, + 0.999, +) +custom_hooks = [ + dict( + tokenizer=dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained'), + type='xtuner.engine.hooks.DatasetInfoHook'), +] +data_path = '/root/share/datasets/FoodieQA/sivqa_llava.json' 
+data_root = '/root/share/datasets/FoodieQA/' +dataloader_num_workers = 4 +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=64, + max_keep_ckpts=-1, + save_optimizer=False, + type='mmengine.hooks.CheckpointHook'), + logger=dict( + interval=10, + log_metric_by_epoch=False, + type='mmengine.hooks.LoggerHook'), + param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'), + sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'), + timer=dict(type='mmengine.hooks.IterTimerHook')) +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +image_folder = '/root/share/datasets/FoodieQA/' +launcher = 'none' +llava_dataset = dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset') +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=False) +lr = 3e-05 +max_epochs = 10 +max_length = 8192 +max_norm = 1 +model = dict( + freeze_llm=True, + freeze_visual_encoder=True, + llm_lora=dict( + lora_alpha=256, + lora_dropout=0.05, + r=128, + target_modules=None, + task_type='CAUSAL_LM', + type='peft.LoraConfig'), + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + type='xtuner.model.InternVL_V1_5') +optim_type = 'torch.optim.AdamW' +optim_wrapper = dict( + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + lr=3e-05, + type='torch.optim.AdamW', + weight_decay=0.05), + type='DeepSpeedOptimWrapper') +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + convert_to_iter_based=True, + end=0.3, + start_factor=1e-05, + type='mmengine.optim.LinearLR'), + dict( + begin=0.3, + by_epoch=True, + convert_to_iter_based=True, + end=10, + eta_min=0.0, + type='mmengine.optim.CosineAnnealingLR'), +] +path = '/root/share/new_models/OpenGVLab/InternVL2-2B' +prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.internlm2_chat' +randomness = dict(deterministic=False, seed=None) +resume = False +runner_type = 'FlexibleRunner' +save_steps = 64 +save_total_limit = -1 +strategy = dict( + config=dict( + bf16=dict(enabled=True), + fp16=dict(enabled=False, initial_scale_power=16), + gradient_accumulation_steps='auto', + gradient_clipping='auto', + train_micro_batch_size_per_gpu='auto', + zero_allow_untested_optimizer=True, + zero_force_ds_cpu_optimizer=False, + zero_optimization=dict(overlap_comm=True, stage=2)), + exclude_frozen_parameters=True, + gradient_accumulation_steps=2, + gradient_clipping=1, + sequence_parallel_size=1, + train_micro_batch_size_per_gpu=4, + type='xtuner.engine.DeepSpeedStrategy') +tokenizer = dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained') +train_cfg = dict(max_epochs=10, type='xtuner.engine.runner.TrainLoop') +train_dataloader = dict( + batch_size=4, + collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'), + dataset=dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset'), + num_workers=4, + sampler=dict( + length_property='modality_length', + 
per_device_batch_size=8, + type='xtuner.dataset.samplers.LengthGroupedSampler')) +visualizer = None +warmup_ratio = 0.03 +weight_decay = 0.05 +work_dir = './work_dirs/internvl_v2_internlm2_2b_lora_finetune_food' + +2024/12/26 09:15:49 - mmengine - WARNING - Failed to search registry with scope "mmengine" in the "builder" registry tree. As a workaround, the current "builder" registry in "xtuner" is used to build instance. This may cause unexpected failure when running the built modules. Please check whether "mmengine" is a correct scope, or whether the registry is initialized. +2024/12/26 09:15:50 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DatasetInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) DatasetInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) DatasetInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/12/26 09:15:50 - mmengine - INFO - Starting to loading data and calc length +2024/12/26 09:15:50 - mmengine - INFO - =======Starting to process /root/share/datasets/FoodieQA/sivqa_llava.json ======= +2024/12/26 09:15:50 - mmengine - INFO - =======total 256 samples of /root/share/datasets/FoodieQA/sivqa_llava.json======= +2024/12/26 09:15:50 - mmengine - INFO - end loading data and calc length +2024/12/26 09:15:50 - mmengine - INFO - =======total 256 samples======= +2024/12/26 09:15:50 - mmengine - INFO - LengthGroupedSampler is used. +2024/12/26 09:15:50 - mmengine - INFO - LengthGroupedSampler construction is complete, and the selected attribute is modality_length +2024/12/26 09:15:50 - mmengine - WARNING - Dataset InternVL_V1_5_Dataset has no metainfo. ``dataset_meta`` in visualizer will be None. 
+2024/12/26 09:15:51 - mmengine - INFO - Start to load InternVL_V1_5 model. +2024/12/26 09:16:14 - mmengine - INFO - InternVL_V1_5( + (data_preprocessor): BaseDataPreprocessor() + (model): InternVLChatModel( + (vision_model): InternVisionModel( + (embeddings): InternVisionEmbeddings( + (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + ) + (encoder): InternVisionEncoder( + (layers): ModuleList( + (0-23): 24 x InternVisionEncoderLayer( + (attn): InternAttention( + (qkv): Linear(in_features=1024, out_features=3072, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=1024, out_features=1024, bias=True) + ) + (mlp): InternMLP( + (act): GELUActivation() + (fc1): Linear(in_features=1024, out_features=4096, bias=True) + (fc2): Linear(in_features=4096, out_features=1024, bias=True) + ) + (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (drop_path1): Identity() + (drop_path2): Identity() + ) + ) + ) + ) + (language_model): PeftModelForCausalLM( + (base_model): LoraModel( + (model): InternLM2ForCausalLM( + (model): InternLM2Model( + (tok_embeddings): Embedding(92553, 2048, padding_idx=2) + (layers): ModuleList( + (0-23): 24 x InternLM2DecoderLayer( + (attention): InternLM2Attention( + (wqkv): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=4096, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=4096, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (wo): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=2048, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=2048, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (rotary_emb): InternLM2DynamicNTKScalingRotaryEmbedding() + ) + (feed_forward): InternLM2MLP( + (w1): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=8192, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=8192, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (w3): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=8192, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=8192, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (w2): lora.Linear( + (base_layer): Linear(in_features=8192, out_features=2048, bias=False) + (lora_dropout): ModuleDict( + (default): 
Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=8192, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=2048, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (act_fn): SiLU() + ) + (attention_norm): InternLM2RMSNorm() + (ffn_norm): InternLM2RMSNorm() + ) + ) + (norm): InternLM2RMSNorm() + ) + (output): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=92553, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=92553, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + ) + ) + ) + (mlp1): Sequential( + (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + (1): Linear(in_features=4096, out_features=2048, bias=True) + (2): GELU(approximate='none') + (3): Linear(in_features=2048, out_features=2048, bias=True) + ) + ) +) +2024/12/26 09:16:14 - mmengine - INFO - InternVL_V1_5 construction is complete +2024/12/26 09:16:21 - mmengine - INFO - Num train samples 256 +2024/12/26 09:16:21 - mmengine - INFO - train example: +2024/12/26 09:16:22 - mmengine - INFO - <|im_start|> system +You are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user + +图片中的食物通常属于哪个菜系?<|im_end|><|im_start|> assistant +新疆菜,图中的菜是烤羊肉串<|im_end|> +2024/12/26 09:16:22 - mmengine - WARNING - "FileClient" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io +2024/12/26 09:16:22 - mmengine - WARNING - "HardDiskBackend" is the alias of "LocalBackend" and the former will be deprecated in future. +2024/12/26 09:16:22 - mmengine - INFO - Checkpoints will be saved to /root/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food. +2024/12/26 09:17:59 - mmengine - INFO - Iter(train) [ 10/640] lr: 1.5000e-05 eta: 1:41:37 time: 9.6788 data_time: 0.0216 memory: 25144 loss: 5.0813 +2024/12/26 09:19:26 - mmengine - INFO - Iter(train) [ 20/640] lr: 3.0000e-05 eta: 1:34:52 time: 8.6854 data_time: 0.0283 memory: 25158 loss: 2.7990 +2024/12/26 09:20:53 - mmengine - INFO - Iter(train) [ 30/640] lr: 2.9981e-05 eta: 1:31:54 time: 8.7560 data_time: 0.0233 memory: 25135 loss: 1.6280 +2024/12/26 09:22:17 - mmengine - INFO - Iter(train) [ 40/640] lr: 2.9923e-05 eta: 1:28:44 time: 8.3757 data_time: 0.0240 memory: 25128 loss: 1.1477 +2024/12/26 09:23:42 - mmengine - INFO - Iter(train) [ 50/640] lr: 2.9828e-05 eta: 1:26:29 time: 8.4815 data_time: 0.0240 memory: 25135 loss: 1.0320 +2024/12/26 09:25:08 - mmengine - INFO - Iter(train) [ 60/640] lr: 2.9694e-05 eta: 1:24:40 time: 8.5829 data_time: 0.0225 memory: 25158 loss: 0.8037 +2024/12/26 09:25:42 - mmengine - INFO - Exp name: internvl_v2_internlm2_2b_lora_finetune_food_20241226_091548 +2024/12/26 09:25:42 - mmengine - INFO - Saving checkpoint at 64 iterations +2024/12/26 09:25:44 - mmengine - WARNING - Reach the end of the dataloader, it will be restarted and continue to iterate. It is recommended to use `mmengine.dataset.InfiniteSampler` to enable the dataloader to iterate infinitely. 
+2024/12/26 09:26:40 - mmengine - INFO - Iter(train) [ 70/640] lr: 2.9523e-05 eta: 1:23:50 time: 9.2130 data_time: 0.5503 memory: 25130 loss: 0.6388 +2024/12/26 09:28:04 - mmengine - INFO - Iter(train) [ 80/640] lr: 2.9314e-05 eta: 1:21:54 time: 8.4384 data_time: 0.0251 memory: 25158 loss: 0.5423 +2024/12/26 09:29:30 - mmengine - INFO - Iter(train) [ 90/640] lr: 2.9069e-05 eta: 1:20:14 time: 8.5706 data_time: 0.0231 memory: 25140 loss: 0.4734 +2024/12/26 09:30:58 - mmengine - INFO - Iter(train) [100/640] lr: 2.8788e-05 eta: 1:18:51 time: 8.8341 data_time: 0.1080 memory: 25144 loss: 0.4123 +2024/12/26 09:32:24 - mmengine - INFO - Iter(train) [110/640] lr: 2.8472e-05 eta: 1:17:15 time: 8.5880 data_time: 0.0239 memory: 25140 loss: 0.4574 +2024/12/26 09:33:47 - mmengine - INFO - Iter(train) [120/640] lr: 2.8121e-05 eta: 1:15:27 time: 8.2847 data_time: 0.0231 memory: 25154 loss: 0.3164 +2024/12/26 09:34:54 - mmengine - INFO - Saving checkpoint at 128 iterations +2024/12/26 09:35:17 - mmengine - INFO - Iter(train) [130/640] lr: 2.7737e-05 eta: 1:14:13 time: 9.0378 data_time: 0.5783 memory: 25158 loss: 0.3944 +2024/12/26 09:36:44 - mmengine - INFO - Iter(train) [140/640] lr: 2.7320e-05 eta: 1:12:43 time: 8.6575 data_time: 0.0229 memory: 25140 loss: 0.1549 +2024/12/26 09:38:08 - mmengine - INFO - Iter(train) [150/640] lr: 2.6871e-05 eta: 1:11:06 time: 8.4369 data_time: 0.0252 memory: 25144 loss: 0.1513 +2024/12/26 09:39:32 - mmengine - INFO - Iter(train) [160/640] lr: 2.6393e-05 eta: 1:09:29 time: 8.3658 data_time: 0.0236 memory: 25106 loss: 0.1510 +2024/12/26 09:40:59 - mmengine - INFO - Iter(train) [170/640] lr: 2.5885e-05 eta: 1:08:03 time: 8.7153 data_time: 0.0229 memory: 25149 loss: 0.1917 +2024/12/26 09:42:24 - mmengine - INFO - Iter(train) [180/640] lr: 2.5349e-05 eta: 1:06:32 time: 8.5279 data_time: 0.0253 memory: 25174 loss: 0.1392 +2024/12/26 09:43:47 - mmengine - INFO - Iter(train) [190/640] lr: 2.4786e-05 eta: 1:04:57 time: 8.3164 data_time: 0.0229 memory: 25144 loss: 0.1285 +2024/12/26 09:44:04 - mmengine - INFO - Saving checkpoint at 192 iterations +2024/12/26 09:45:19 - mmengine - INFO - Iter(train) [200/640] lr: 2.4199e-05 eta: 1:03:41 time: 9.1506 data_time: 0.5900 memory: 25140 loss: 0.0807 +2024/12/26 09:46:44 - mmengine - INFO - Iter(train) [210/640] lr: 2.3588e-05 eta: 1:02:10 time: 8.5118 data_time: 0.0245 memory: 25135 loss: 0.0460 +2024/12/26 09:48:11 - mmengine - INFO - Iter(train) [220/640] lr: 2.2955e-05 eta: 1:00:44 time: 8.6730 data_time: 0.0236 memory: 25154 loss: 0.0616 +2024/12/26 09:49:36 - mmengine - INFO - Iter(train) [230/640] lr: 2.2302e-05 eta: 0:59:13 time: 8.4753 data_time: 0.0316 memory: 25149 loss: 0.0491 +2024/12/26 09:51:00 - mmengine - INFO - Iter(train) [240/640] lr: 2.1630e-05 eta: 0:57:43 time: 8.4297 data_time: 0.0241 memory: 25140 loss: 0.0819 +2024/12/26 09:52:23 - mmengine - INFO - Iter(train) [250/640] lr: 2.0941e-05 eta: 0:56:11 time: 8.3317 data_time: 0.0233 memory: 25158 loss: 0.0912 +2024/12/26 09:53:14 - mmengine - INFO - Saving checkpoint at 256 iterations +2024/12/26 09:53:54 - mmengine - INFO - Iter(train) [260/640] lr: 2.0237e-05 eta: 0:54:50 time: 9.0374 data_time: 0.5966 memory: 25140 loss: 0.0547 +2024/12/26 09:55:19 - mmengine - INFO - Iter(train) [270/640] lr: 1.9520e-05 eta: 0:53:22 time: 8.5118 data_time: 0.0244 memory: 25144 loss: 0.0172 +2024/12/26 09:56:47 - mmengine - INFO - Iter(train) [280/640] lr: 1.8791e-05 eta: 0:51:57 time: 8.8357 data_time: 0.0224 memory: 25158 loss: 0.0445 +2024/12/26 09:58:18 - mmengine - INFO - 
Iter(train) [290/640] lr: 1.8052e-05 eta: 0:50:36 time: 9.0649 data_time: 0.0229 memory: 25154 loss: 0.0170 +2024/12/26 09:59:45 - mmengine - INFO - Iter(train) [300/640] lr: 1.7305e-05 eta: 0:49:10 time: 8.7547 data_time: 0.0243 memory: 25154 loss: 0.0094 +2024/12/26 10:01:15 - mmengine - INFO - Iter(train) [310/640] lr: 1.6553e-05 eta: 0:47:46 time: 8.9967 data_time: 0.0229 memory: 25135 loss: 0.0349 +2024/12/26 10:02:43 - mmengine - INFO - Iter(train) [320/640] lr: 1.5796e-05 eta: 0:46:21 time: 8.7930 data_time: 0.0227 memory: 25135 loss: 0.0370 +2024/12/26 10:02:43 - mmengine - INFO - Saving checkpoint at 320 iterations +2024/12/26 10:04:19 - mmengine - INFO - Iter(train) [330/640] lr: 1.5038e-05 eta: 0:45:02 time: 9.5633 data_time: 0.7590 memory: 25144 loss: 0.0024 +2024/12/26 10:05:48 - mmengine - INFO - Iter(train) [340/640] lr: 1.4279e-05 eta: 0:43:36 time: 8.8874 data_time: 0.0234 memory: 25140 loss: 0.0017 +2024/12/26 10:07:14 - mmengine - INFO - Iter(train) [350/640] lr: 1.3523e-05 eta: 0:42:08 time: 8.5941 data_time: 0.0252 memory: 25154 loss: 0.0112 +2024/12/26 10:08:38 - mmengine - INFO - Iter(train) [360/640] lr: 1.2770e-05 eta: 0:40:39 time: 8.4659 data_time: 0.0263 memory: 25154 loss: 0.0043 +2024/12/26 10:10:01 - mmengine - INFO - Iter(train) [370/640] lr: 1.2022e-05 eta: 0:39:08 time: 8.2607 data_time: 0.0270 memory: 25163 loss: 0.0044 +2024/12/26 10:11:24 - mmengine - INFO - Iter(train) [380/640] lr: 1.1283e-05 eta: 0:37:39 time: 8.3497 data_time: 0.0248 memory: 25149 loss: 0.0102 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/20241226_091548.json b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/20241226_091548.json new file mode 100644 index 0000000000000000000000000000000000000000..2bf97dfcf0f090549b2503fda7765809907bae17 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/20241226_091548.json @@ -0,0 +1,38 @@ +{"lr": 1.500015e-05, "data_time": 0.021595048904418945, "loss": 5.081307458877563, "time": 9.678821516036987, "iter": 10, "memory": 25144, "step": 10} +{"lr": 3e-05, "data_time": 0.02830979824066162, "loss": 2.798967719078064, "time": 8.685412645339966, "iter": 20, "memory": 25158, "step": 20} +{"lr": 2.99808095489134e-05, "data_time": 0.023261833190917968, "loss": 1.627983421087265, "time": 8.756024384498597, "iter": 30, "memory": 25135, "step": 30} +{"lr": 2.9923287298775314e-05, "data_time": 0.024034762382507326, "loss": 1.1477288007736206, "time": 8.375715494155884, "iter": 40, "memory": 25128, "step": 40} +{"lr": 2.9827580433309446e-05, "data_time": 0.024009037017822265, "loss": 1.0320466876029968, "time": 8.48148512840271, "iter": 50, "memory": 25135, "step": 50} +{"lr": 2.9693933840238504e-05, "data_time": 0.022455358505249025, "loss": 0.8036810219287872, "time": 8.582894492149354, "iter": 60, "memory": 25158, "step": 60} +{"lr": 2.952268948468346e-05, "data_time": 0.5502834320068359, "loss": 0.6387895777821541, "time": 9.212956261634826, "iter": 70, "memory": 25130, "step": 70} +{"lr": 2.9314285534168186e-05, "data_time": 0.025134897232055663, "loss": 0.5422727972269058, "time": 8.438357257843018, "iter": 80, "memory": 25158, "step": 80} +{"lr": 2.90692552374685e-05, "data_time": 0.023055315017700195, "loss": 0.47342642694711684, "time": 8.570553088188172, "iter": 90, "memory": 25140, "step": 90} +{"lr": 2.8788225560174216e-05, "data_time": 0.10801157951354981, "loss": 0.4123392358422279, "time": 8.834052085876465, "iter": 100, 
"memory": 25144, "step": 100} +{"lr": 2.847191558045545e-05, "data_time": 0.023906779289245606, "loss": 0.4574140504002571, "time": 8.58795998096466, "iter": 110, "memory": 25140, "step": 110} +{"lr": 2.8121134649138086e-05, "data_time": 0.023064517974853517, "loss": 0.3164264470338821, "time": 8.284708142280579, "iter": 120, "memory": 25154, "step": 120} +{"lr": 2.7736780318796056e-05, "data_time": 0.5783360004425049, "loss": 0.39438126869499684, "time": 9.03779730796814, "iter": 130, "memory": 25158, "step": 130} +{"lr": 2.7319836047159543e-05, "data_time": 0.022910189628601075, "loss": 0.15488401651382447, "time": 8.657539463043213, "iter": 140, "memory": 25140, "step": 140} +{"lr": 2.68713686807153e-05, "data_time": 0.02519075870513916, "loss": 0.1512989214155823, "time": 8.436856436729432, "iter": 150, "memory": 25144, "step": 150} +{"lr": 2.639252572493797e-05, "data_time": 0.023575544357299805, "loss": 0.15096470043063165, "time": 8.365842866897584, "iter": 160, "memory": 25106, "step": 160} +{"lr": 2.5884532408136998e-05, "data_time": 0.02287259101867676, "loss": 0.19168278705328703, "time": 8.715261340141296, "iter": 170, "memory": 25149, "step": 170} +{"lr": 2.534868854643217e-05, "data_time": 0.025314021110534667, "loss": 0.1392207656055689, "time": 8.527868676185609, "iter": 180, "memory": 25174, "step": 180} +{"lr": 2.4786365217879254e-05, "data_time": 0.022918891906738282, "loss": 0.12854969091713428, "time": 8.316402006149293, "iter": 190, "memory": 25144, "step": 190} +{"lr": 2.419900125425576e-05, "data_time": 0.5899852514266968, "loss": 0.08068367335945367, "time": 9.150607562065124, "iter": 200, "memory": 25140, "step": 200} +{"lr": 2.3588099559483543e-05, "data_time": 0.024477577209472655, "loss": 0.045956420013681054, "time": 8.511816692352294, "iter": 210, "memory": 25135, "step": 210} +{"lr": 2.2955223264108254e-05, "data_time": 0.02362987995147705, "loss": 0.061592530878260734, "time": 8.672991347312927, "iter": 220, "memory": 25154, "step": 220} +{"lr": 2.2301991725675243e-05, "data_time": 0.03161656856536865, "loss": 0.049098231643438336, "time": 8.475338077545166, "iter": 230, "memory": 25149, "step": 230} +{"lr": 2.163007638523606e-05, "data_time": 0.024078869819641115, "loss": 0.08190386234782636, "time": 8.429711270332337, "iter": 240, "memory": 25140, "step": 240} +{"lr": 2.094119649058736e-05, "data_time": 0.02326529026031494, "loss": 0.09116816509049386, "time": 8.331741666793823, "iter": 250, "memory": 25158, "step": 250} +{"lr": 2.0237114697185536e-05, "data_time": 0.596574854850769, "loss": 0.05472523393109441, "time": 9.037416434288025, "iter": 260, "memory": 25140, "step": 260} +{"lr": 1.9519632557992884e-05, "data_time": 0.024390649795532227, "loss": 0.017242011532653125, "time": 8.511841464042664, "iter": 270, "memory": 25144, "step": 270} +{"lr": 1.8790585913795754e-05, "data_time": 0.022441720962524413, "loss": 0.044481908105080945, "time": 8.835672569274902, "iter": 280, "memory": 25158, "step": 280} +{"lr": 1.8051840195789513e-05, "data_time": 0.02289402484893799, "loss": 0.016986915236338974, "time": 9.064851760864258, "iter": 290, "memory": 25154, "step": 290} +{"lr": 1.7305285652449754e-05, "data_time": 0.024278950691223145, "loss": 0.009410060296067968, "time": 8.754712629318238, "iter": 300, "memory": 25154, "step": 300} +{"lr": 1.6552832512902796e-05, "data_time": 0.02286374568939209, "loss": 0.03494289379450492, "time": 8.996715474128724, "iter": 310, "memory": 25135, "step": 310} +{"lr": 1.579640609917124e-05, "data_time": 
0.02268562316894531, "loss": 0.03703032763442025, "time": 8.792994403839112, "iter": 320, "memory": 25135, "step": 320} +{"lr": 1.5037941899800858e-05, "data_time": 0.7590210676193238, "loss": 0.002382425547693856, "time": 9.563341665267945, "iter": 330, "memory": 25144, "step": 330} +{"lr": 1.4279380617474167e-05, "data_time": 0.023433160781860352, "loss": 0.0017295103811193258, "time": 8.887393450737, "iter": 340, "memory": 25140, "step": 340} +{"lr": 1.3522663203282473e-05, "data_time": 0.025190973281860353, "loss": 0.011237028567120434, "time": 8.594102144241333, "iter": 350, "memory": 25154, "step": 350} +{"lr": 1.2769725890362214e-05, "data_time": 0.026250243186950684, "loss": 0.004318543081171811, "time": 8.465864157676696, "iter": 360, "memory": 25154, "step": 360} +{"lr": 1.2022495239603391e-05, "data_time": 0.02704763412475586, "loss": 0.004369065738865174, "time": 8.260673761367798, "iter": 370, "memory": 25163, "step": 370} +{"lr": 1.1282883210106502e-05, "data_time": 0.024756717681884765, "loss": 0.010165342743857764, "time": 8.349717259407043, "iter": 380, "memory": 25149, "step": 380} diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/config.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..1e03a56b0d09f5f7e3fe173adf5b8088953c1cea --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/config.py @@ -0,0 +1,139 @@ +accumulative_counts = 2 +batch_size = 4 +betas = ( + 0.9, + 0.999, +) +custom_hooks = [ + dict( + tokenizer=dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained'), + type='xtuner.engine.hooks.DatasetInfoHook'), +] +data_path = '/root/share/datasets/FoodieQA/sivqa_llava.json' +data_root = '/root/share/datasets/FoodieQA/' +dataloader_num_workers = 4 +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=64, + max_keep_ckpts=-1, + save_optimizer=False, + type='mmengine.hooks.CheckpointHook'), + logger=dict( + interval=10, + log_metric_by_epoch=False, + type='mmengine.hooks.LoggerHook'), + param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'), + sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'), + timer=dict(type='mmengine.hooks.IterTimerHook')) +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +image_folder = '/root/share/datasets/FoodieQA/' +launcher = 'none' +llava_dataset = dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset') +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=False) +lr = 3e-05 +max_epochs = 10 +max_length = 8192 +max_norm = 1 +model = dict( + freeze_llm=True, + freeze_visual_encoder=True, + llm_lora=dict( + lora_alpha=256, + lora_dropout=0.05, + r=128, + target_modules=None, + task_type='CAUSAL_LM', + type='peft.LoraConfig'), + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + type='xtuner.model.InternVL_V1_5') +optim_type = 'torch.optim.AdamW' +optim_wrapper = dict( + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + lr=3e-05, + 
type='torch.optim.AdamW', + weight_decay=0.05), + type='DeepSpeedOptimWrapper') +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + convert_to_iter_based=True, + end=0.3, + start_factor=1e-05, + type='mmengine.optim.LinearLR'), + dict( + begin=0.3, + by_epoch=True, + convert_to_iter_based=True, + end=10, + eta_min=0.0, + type='mmengine.optim.CosineAnnealingLR'), +] +path = '/root/share/new_models/OpenGVLab/InternVL2-2B' +prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.internlm2_chat' +randomness = dict(deterministic=False, seed=None) +resume = False +runner_type = 'FlexibleRunner' +save_steps = 64 +save_total_limit = -1 +strategy = dict( + config=dict( + bf16=dict(enabled=True), + fp16=dict(enabled=False, initial_scale_power=16), + gradient_accumulation_steps='auto', + gradient_clipping='auto', + train_micro_batch_size_per_gpu='auto', + zero_allow_untested_optimizer=True, + zero_force_ds_cpu_optimizer=False, + zero_optimization=dict(overlap_comm=True, stage=2)), + exclude_frozen_parameters=True, + gradient_accumulation_steps=2, + gradient_clipping=1, + sequence_parallel_size=1, + train_micro_batch_size_per_gpu=4, + type='xtuner.engine.DeepSpeedStrategy') +tokenizer = dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained') +train_cfg = dict(max_epochs=10, type='xtuner.engine.runner.TrainLoop') +train_dataloader = dict( + batch_size=4, + collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'), + dataset=dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset'), + num_workers=4, + sampler=dict( + length_property='modality_length', + per_device_batch_size=8, + type='xtuner.dataset.samplers.LengthGroupedSampler')) +visualizer = None +warmup_ratio = 0.03 +weight_decay = 0.05 +work_dir = './work_dirs/internvl_v2_internlm2_2b_lora_finetune_food' diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/scalars.json b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/scalars.json new file mode 100644 index 0000000000000000000000000000000000000000..2bf97dfcf0f090549b2503fda7765809907bae17 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_091548/vis_data/scalars.json @@ -0,0 +1,38 @@ +{"lr": 1.500015e-05, "data_time": 0.021595048904418945, "loss": 5.081307458877563, "time": 9.678821516036987, "iter": 10, "memory": 25144, "step": 10} +{"lr": 3e-05, "data_time": 0.02830979824066162, "loss": 2.798967719078064, "time": 8.685412645339966, "iter": 20, "memory": 25158, "step": 20} +{"lr": 2.99808095489134e-05, "data_time": 0.023261833190917968, "loss": 1.627983421087265, "time": 8.756024384498597, "iter": 30, "memory": 25135, "step": 30} +{"lr": 2.9923287298775314e-05, "data_time": 0.024034762382507326, "loss": 1.1477288007736206, "time": 8.375715494155884, "iter": 40, "memory": 25128, "step": 40} +{"lr": 2.9827580433309446e-05, "data_time": 0.024009037017822265, "loss": 1.0320466876029968, "time": 8.48148512840271, "iter": 50, "memory": 25135, "step": 50} +{"lr": 2.9693933840238504e-05, "data_time": 0.022455358505249025, "loss": 0.8036810219287872, "time": 8.582894492149354, "iter": 60, "memory": 25158, "step": 60} +{"lr": 
2.952268948468346e-05, "data_time": 0.5502834320068359, "loss": 0.6387895777821541, "time": 9.212956261634826, "iter": 70, "memory": 25130, "step": 70} +{"lr": 2.9314285534168186e-05, "data_time": 0.025134897232055663, "loss": 0.5422727972269058, "time": 8.438357257843018, "iter": 80, "memory": 25158, "step": 80} +{"lr": 2.90692552374685e-05, "data_time": 0.023055315017700195, "loss": 0.47342642694711684, "time": 8.570553088188172, "iter": 90, "memory": 25140, "step": 90} +{"lr": 2.8788225560174216e-05, "data_time": 0.10801157951354981, "loss": 0.4123392358422279, "time": 8.834052085876465, "iter": 100, "memory": 25144, "step": 100} +{"lr": 2.847191558045545e-05, "data_time": 0.023906779289245606, "loss": 0.4574140504002571, "time": 8.58795998096466, "iter": 110, "memory": 25140, "step": 110} +{"lr": 2.8121134649138086e-05, "data_time": 0.023064517974853517, "loss": 0.3164264470338821, "time": 8.284708142280579, "iter": 120, "memory": 25154, "step": 120} +{"lr": 2.7736780318796056e-05, "data_time": 0.5783360004425049, "loss": 0.39438126869499684, "time": 9.03779730796814, "iter": 130, "memory": 25158, "step": 130} +{"lr": 2.7319836047159543e-05, "data_time": 0.022910189628601075, "loss": 0.15488401651382447, "time": 8.657539463043213, "iter": 140, "memory": 25140, "step": 140} +{"lr": 2.68713686807153e-05, "data_time": 0.02519075870513916, "loss": 0.1512989214155823, "time": 8.436856436729432, "iter": 150, "memory": 25144, "step": 150} +{"lr": 2.639252572493797e-05, "data_time": 0.023575544357299805, "loss": 0.15096470043063165, "time": 8.365842866897584, "iter": 160, "memory": 25106, "step": 160} +{"lr": 2.5884532408136998e-05, "data_time": 0.02287259101867676, "loss": 0.19168278705328703, "time": 8.715261340141296, "iter": 170, "memory": 25149, "step": 170} +{"lr": 2.534868854643217e-05, "data_time": 0.025314021110534667, "loss": 0.1392207656055689, "time": 8.527868676185609, "iter": 180, "memory": 25174, "step": 180} +{"lr": 2.4786365217879254e-05, "data_time": 0.022918891906738282, "loss": 0.12854969091713428, "time": 8.316402006149293, "iter": 190, "memory": 25144, "step": 190} +{"lr": 2.419900125425576e-05, "data_time": 0.5899852514266968, "loss": 0.08068367335945367, "time": 9.150607562065124, "iter": 200, "memory": 25140, "step": 200} +{"lr": 2.3588099559483543e-05, "data_time": 0.024477577209472655, "loss": 0.045956420013681054, "time": 8.511816692352294, "iter": 210, "memory": 25135, "step": 210} +{"lr": 2.2955223264108254e-05, "data_time": 0.02362987995147705, "loss": 0.061592530878260734, "time": 8.672991347312927, "iter": 220, "memory": 25154, "step": 220} +{"lr": 2.2301991725675243e-05, "data_time": 0.03161656856536865, "loss": 0.049098231643438336, "time": 8.475338077545166, "iter": 230, "memory": 25149, "step": 230} +{"lr": 2.163007638523606e-05, "data_time": 0.024078869819641115, "loss": 0.08190386234782636, "time": 8.429711270332337, "iter": 240, "memory": 25140, "step": 240} +{"lr": 2.094119649058736e-05, "data_time": 0.02326529026031494, "loss": 0.09116816509049386, "time": 8.331741666793823, "iter": 250, "memory": 25158, "step": 250} +{"lr": 2.0237114697185536e-05, "data_time": 0.596574854850769, "loss": 0.05472523393109441, "time": 9.037416434288025, "iter": 260, "memory": 25140, "step": 260} +{"lr": 1.9519632557992884e-05, "data_time": 0.024390649795532227, "loss": 0.017242011532653125, "time": 8.511841464042664, "iter": 270, "memory": 25144, "step": 270} +{"lr": 1.8790585913795754e-05, "data_time": 0.022441720962524413, "loss": 0.044481908105080945, "time": 
8.835672569274902, "iter": 280, "memory": 25158, "step": 280} +{"lr": 1.8051840195789513e-05, "data_time": 0.02289402484893799, "loss": 0.016986915236338974, "time": 9.064851760864258, "iter": 290, "memory": 25154, "step": 290} +{"lr": 1.7305285652449754e-05, "data_time": 0.024278950691223145, "loss": 0.009410060296067968, "time": 8.754712629318238, "iter": 300, "memory": 25154, "step": 300} +{"lr": 1.6552832512902796e-05, "data_time": 0.02286374568939209, "loss": 0.03494289379450492, "time": 8.996715474128724, "iter": 310, "memory": 25135, "step": 310} +{"lr": 1.579640609917124e-05, "data_time": 0.02268562316894531, "loss": 0.03703032763442025, "time": 8.792994403839112, "iter": 320, "memory": 25135, "step": 320} +{"lr": 1.5037941899800858e-05, "data_time": 0.7590210676193238, "loss": 0.002382425547693856, "time": 9.563341665267945, "iter": 330, "memory": 25144, "step": 330} +{"lr": 1.4279380617474167e-05, "data_time": 0.023433160781860352, "loss": 0.0017295103811193258, "time": 8.887393450737, "iter": 340, "memory": 25140, "step": 340} +{"lr": 1.3522663203282473e-05, "data_time": 0.025190973281860353, "loss": 0.011237028567120434, "time": 8.594102144241333, "iter": 350, "memory": 25154, "step": 350} +{"lr": 1.2769725890362214e-05, "data_time": 0.026250243186950684, "loss": 0.004318543081171811, "time": 8.465864157676696, "iter": 360, "memory": 25154, "step": 360} +{"lr": 1.2022495239603391e-05, "data_time": 0.02704763412475586, "loss": 0.004369065738865174, "time": 8.260673761367798, "iter": 370, "memory": 25163, "step": 370} +{"lr": 1.1282883210106502e-05, "data_time": 0.024756717681884765, "loss": 0.010165342743857764, "time": 8.349717259407043, "iter": 380, "memory": 25149, "step": 380} diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/20241226_105245.log b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/20241226_105245.log new file mode 100644 index 0000000000000000000000000000000000000000..1f567d13a3e3afa923934cf13d8c3aa47e1052f7 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/20241226_105245.log @@ -0,0 +1,517 @@ +2024/12/26 10:52:46 - mmengine - INFO - +------------------------------------------------------------ +System environment: + sys.platform: linux + Python: 3.10.16 (main, Dec 11 2024, 16:24:50) [GCC 11.2.0] + CUDA available: True + MUSA available: False + numpy_random_seed: 1403011837 + GPU 0: NVIDIA A100-SXM4-80GB + CUDA_HOME: /usr/local/cuda + NVCC: Cuda compilation tools, release 12.2, V12.2.140 + GCC: gcc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 + PyTorch: 2.4.1+cu121 + PyTorch compiling details: PyTorch built with: + - GCC 9.3 + - C++ Version: 201703 + - Intel(R) oneAPI Math Kernel Library Version 2022.2-Product Build 20220804 for Intel(R) 64 architecture applications + - Intel(R) MKL-DNN v3.4.2 (Git Hash 1137e04ec0b5251ca2b4400a4fd3c667ce843d67) + - OpenMP 201511 (a.k.a. 
OpenMP 4.5) + - LAPACK is enabled (usually provided by MKL) + - NNPACK is enabled + - CPU capability usage: AVX512 + - CUDA Runtime 12.1 + - NVCC architecture flags: -gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90 + - CuDNN 90.1 (built against CUDA 12.4) + - Magma 2.6.1 + - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CUDA_VERSION=12.1, CUDNN_VERSION=9.1.0, CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 -fabi-version=11 -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -O2 -fPIC -Wall -Wextra -Werror=return-type -Werror=non-virtual-dtor -Werror=bool-operation -Wnarrowing -Wno-missing-field-initializers -Wno-type-limits -Wno-array-bounds -Wno-unknown-pragmas -Wno-unused-parameter -Wno-unused-function -Wno-unused-result -Wno-strict-overflow -Wno-strict-aliasing -Wno-stringop-overflow -Wsuggest-override -Wno-psabi -Wno-error=pedantic -Wno-error=old-style-cast -Wno-missing-braces -fdiagnostics-color=always -faligned-new -Wno-unused-but-set-variable -Wno-maybe-uninitialized -fno-math-errno -fno-trapping-math -Werror=format -Wno-stringop-overflow, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PERF_WITH_AVX512=1, TORCH_VERSION=2.4.1, USE_CUDA=ON, USE_CUDNN=ON, USE_CUSPARSELT=1, USE_EXCEPTION_PTR=1, USE_GFLAGS=OFF, USE_GLOG=OFF, USE_GLOO=ON, USE_MKL=ON, USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=1, USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, USE_ROCM_KERNEL_ASSERT=OFF, + + TorchVision: 0.19.1+cu121 + OpenCV: 4.10.0 + MMEngine: 0.10.5 + +Runtime environment: + launcher: none + randomness: {'seed': None, 'deterministic': False} + cudnn_benchmark: False + mp_cfg: {'mp_start_method': 'fork', 'opencv_num_threads': 0} + dist_cfg: {'backend': 'nccl'} + seed: None + deterministic: False + Distributed launcher: none + Distributed training: False + GPU number: 1 +------------------------------------------------------------ + +2024/12/26 10:52:46 - mmengine - INFO - Config: +accumulative_counts = 2 +batch_size = 4 +betas = ( + 0.9, + 0.999, +) +custom_hooks = [ + dict( + tokenizer=dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained'), + type='xtuner.engine.hooks.DatasetInfoHook'), +] +data_path = '/root/share/datasets/FoodieQA/sivqa_llava.json' +data_root = '/root/share/datasets/FoodieQA/' +dataloader_num_workers = 4 +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=64, + max_keep_ckpts=-1, + save_optimizer=False, + type='mmengine.hooks.CheckpointHook'), + logger=dict( + interval=10, + log_metric_by_epoch=False, + type='mmengine.hooks.LoggerHook'), + param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'), + sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'), + timer=dict(type='mmengine.hooks.IterTimerHook')) +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +image_folder = '/root/share/datasets/FoodieQA/' +launcher = 'none' +llava_dataset = dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + 
model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset') +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=False) +lr = 3e-05 +max_epochs = 10 +max_length = 8192 +max_norm = 1 +model = dict( + freeze_llm=True, + freeze_visual_encoder=True, + llm_lora=dict( + lora_alpha=256, + lora_dropout=0.05, + r=128, + target_modules=None, + task_type='CAUSAL_LM', + type='peft.LoraConfig'), + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + type='xtuner.model.InternVL_V1_5') +optim_type = 'torch.optim.AdamW' +optim_wrapper = dict( + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + lr=3e-05, + type='torch.optim.AdamW', + weight_decay=0.05), + type='DeepSpeedOptimWrapper') +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + convert_to_iter_based=True, + end=0.3, + start_factor=1e-05, + type='mmengine.optim.LinearLR'), + dict( + begin=0.3, + by_epoch=True, + convert_to_iter_based=True, + end=10, + eta_min=0.0, + type='mmengine.optim.CosineAnnealingLR'), +] +path = '/root/share/new_models/OpenGVLab/InternVL2-2B' +prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.internlm2_chat' +randomness = dict(deterministic=False, seed=None) +resume = False +runner_type = 'FlexibleRunner' +save_steps = 64 +save_total_limit = -1 +strategy = dict( + config=dict( + bf16=dict(enabled=True), + fp16=dict(enabled=False, initial_scale_power=16), + gradient_accumulation_steps='auto', + gradient_clipping='auto', + train_micro_batch_size_per_gpu='auto', + zero_allow_untested_optimizer=True, + zero_force_ds_cpu_optimizer=False, + zero_optimization=dict(overlap_comm=True, stage=2)), + exclude_frozen_parameters=True, + gradient_accumulation_steps=2, + gradient_clipping=1, + sequence_parallel_size=1, + train_micro_batch_size_per_gpu=4, + type='xtuner.engine.DeepSpeedStrategy') +tokenizer = dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained') +train_cfg = dict(max_epochs=10, type='xtuner.engine.runner.TrainLoop') +train_dataloader = dict( + batch_size=4, + collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'), + dataset=dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset'), + num_workers=4, + sampler=dict( + length_property='modality_length', + per_device_batch_size=8, + type='xtuner.dataset.samplers.LengthGroupedSampler')) +visualizer = None +warmup_ratio = 0.03 +weight_decay = 0.05 +work_dir = './work_dirs/internvl_v2_internlm2_2b_lora_finetune_food' + +2024/12/26 10:52:46 - mmengine - WARNING - Failed to search registry with scope "mmengine" in the "builder" registry tree. As a workaround, the current "builder" registry in "xtuner" is used to build instance. This may cause unexpected failure when running the built modules. Please check whether "mmengine" is a correct scope, or whether the registry is initialized. 
+2024/12/26 10:52:46 - mmengine - INFO - Hooks will be executed in the following order: +before_run: +(VERY_HIGH ) RuntimeInfoHook +(BELOW_NORMAL) LoggerHook + -------------------- +before_train: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DatasetInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_train_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(NORMAL ) DistSamplerSeedHook + -------------------- +before_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook + -------------------- +after_train_iter: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_train_epoch: +(NORMAL ) IterTimerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +before_val: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) DatasetInfoHook + -------------------- +before_val_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_val_iter: +(NORMAL ) IterTimerHook + -------------------- +after_val_iter: +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_val_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook +(LOW ) ParamSchedulerHook +(VERY_LOW ) CheckpointHook + -------------------- +after_val: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_train: +(VERY_HIGH ) RuntimeInfoHook +(VERY_LOW ) CheckpointHook + -------------------- +before_test: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) DatasetInfoHook + -------------------- +before_test_epoch: +(NORMAL ) IterTimerHook + -------------------- +before_test_iter: +(NORMAL ) IterTimerHook + -------------------- +after_test_iter: +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test_epoch: +(VERY_HIGH ) RuntimeInfoHook +(NORMAL ) IterTimerHook +(BELOW_NORMAL) LoggerHook + -------------------- +after_test: +(VERY_HIGH ) RuntimeInfoHook + -------------------- +after_run: +(BELOW_NORMAL) LoggerHook + -------------------- +2024/12/26 10:52:47 - mmengine - INFO - Starting to loading data and calc length +2024/12/26 10:52:47 - mmengine - INFO - =======Starting to process /root/share/datasets/FoodieQA/sivqa_llava.json ======= +2024/12/26 10:52:47 - mmengine - INFO - =======total 256 samples of /root/share/datasets/FoodieQA/sivqa_llava.json======= +2024/12/26 10:52:47 - mmengine - INFO - end loading data and calc length +2024/12/26 10:52:47 - mmengine - INFO - =======total 256 samples======= +2024/12/26 10:52:47 - mmengine - INFO - LengthGroupedSampler is used. +2024/12/26 10:52:47 - mmengine - INFO - LengthGroupedSampler construction is complete, and the selected attribute is modality_length +2024/12/26 10:52:47 - mmengine - WARNING - Dataset InternVL_V1_5_Dataset has no metainfo. ``dataset_meta`` in visualizer will be None. +2024/12/26 10:52:47 - mmengine - INFO - Start to load InternVL_V1_5 model. 
+2024/12/26 10:53:11 - mmengine - INFO - InternVL_V1_5( + (data_preprocessor): BaseDataPreprocessor() + (model): InternVLChatModel( + (vision_model): InternVisionModel( + (embeddings): InternVisionEmbeddings( + (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14)) + ) + (encoder): InternVisionEncoder( + (layers): ModuleList( + (0-23): 24 x InternVisionEncoderLayer( + (attn): InternAttention( + (qkv): Linear(in_features=1024, out_features=3072, bias=True) + (attn_drop): Dropout(p=0.0, inplace=False) + (proj_drop): Dropout(p=0.0, inplace=False) + (proj): Linear(in_features=1024, out_features=1024, bias=True) + ) + (mlp): InternMLP( + (act): GELUActivation() + (fc1): Linear(in_features=1024, out_features=4096, bias=True) + (fc2): Linear(in_features=4096, out_features=1024, bias=True) + ) + (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True) + (drop_path1): Identity() + (drop_path2): Identity() + ) + ) + ) + ) + (language_model): PeftModelForCausalLM( + (base_model): LoraModel( + (model): InternLM2ForCausalLM( + (model): InternLM2Model( + (tok_embeddings): Embedding(92553, 2048, padding_idx=2) + (layers): ModuleList( + (0-23): 24 x InternLM2DecoderLayer( + (attention): InternLM2Attention( + (wqkv): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=4096, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=4096, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (wo): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=2048, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=2048, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (rotary_emb): InternLM2DynamicNTKScalingRotaryEmbedding() + ) + (feed_forward): InternLM2MLP( + (w1): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=8192, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=8192, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (w3): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=8192, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=8192, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (w2): lora.Linear( + (base_layer): Linear(in_features=8192, out_features=2048, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): 
Linear(in_features=8192, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=2048, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + (act_fn): SiLU() + ) + (attention_norm): InternLM2RMSNorm() + (ffn_norm): InternLM2RMSNorm() + ) + ) + (norm): InternLM2RMSNorm() + ) + (output): lora.Linear( + (base_layer): Linear(in_features=2048, out_features=92553, bias=False) + (lora_dropout): ModuleDict( + (default): Dropout(p=0.05, inplace=False) + ) + (lora_A): ModuleDict( + (default): Linear(in_features=2048, out_features=128, bias=False) + ) + (lora_B): ModuleDict( + (default): Linear(in_features=128, out_features=92553, bias=False) + ) + (lora_embedding_A): ParameterDict() + (lora_embedding_B): ParameterDict() + (lora_magnitude_vector): ModuleDict() + ) + ) + ) + ) + (mlp1): Sequential( + (0): LayerNorm((4096,), eps=1e-05, elementwise_affine=True) + (1): Linear(in_features=4096, out_features=2048, bias=True) + (2): GELU(approximate='none') + (3): Linear(in_features=2048, out_features=2048, bias=True) + ) + ) +) +2024/12/26 10:53:11 - mmengine - INFO - InternVL_V1_5 construction is complete +2024/12/26 10:53:27 - mmengine - INFO - Num train samples 256 +2024/12/26 10:53:27 - mmengine - INFO - train example: +2024/12/26 10:53:28 - mmengine - INFO - <|im_start|> system +You are an AI assistant whose name is InternLM (书生·浦语).<|im_end|><|im_start|>user + +图片中的食物通常属于哪个菜系?<|im_end|><|im_start|> assistant +新疆菜,图中的菜是烤羊肉串<|im_end|> +2024/12/26 10:53:28 - mmengine - WARNING - "FileClient" will be deprecated in future. Please use io functions in https://mmengine.readthedocs.io/en/latest/api/fileio.html#file-io +2024/12/26 10:53:28 - mmengine - WARNING - "HardDiskBackend" is the alias of "LocalBackend" and the former will be deprecated in future. +2024/12/26 10:53:28 - mmengine - INFO - Checkpoints will be saved to /root/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food. +2024/12/26 10:55:15 - mmengine - INFO - Iter(train) [ 10/640] lr: 1.5000e-05 eta: 1:51:41 time: 10.6379 data_time: 0.0244 memory: 25149 loss: 5.2577 +2024/12/26 10:56:41 - mmengine - INFO - Iter(train) [ 20/640] lr: 3.0000e-05 eta: 1:39:34 time: 8.6363 data_time: 0.0394 memory: 25140 loss: 2.6929 +2024/12/26 10:58:06 - mmengine - INFO - Iter(train) [ 30/640] lr: 2.9981e-05 eta: 1:34:03 time: 8.4823 data_time: 0.0267 memory: 25140 loss: 1.3502 +2024/12/26 10:59:30 - mmengine - INFO - Iter(train) [ 40/640] lr: 2.9923e-05 eta: 1:30:24 time: 8.4094 data_time: 0.0919 memory: 25140 loss: 1.0310 +2024/12/26 11:00:54 - mmengine - INFO - Iter(train) [ 50/640] lr: 2.9828e-05 eta: 1:27:37 time: 8.3925 data_time: 0.0306 memory: 25135 loss: 1.0154 +2024/12/26 11:02:16 - mmengine - INFO - Iter(train) [ 60/640] lr: 2.9694e-05 eta: 1:25:05 time: 8.2611 data_time: 0.0247 memory: 25135 loss: 0.9105 +2024/12/26 11:02:49 - mmengine - INFO - Exp name: internvl_v2_internlm2_2b_lora_finetune_food_20241226_105245 +2024/12/26 11:02:49 - mmengine - INFO - Saving checkpoint at 64 iterations +2024/12/26 11:02:51 - mmengine - WARNING - Reach the end of the dataloader, it will be restarted and continue to iterate. It is recommended to use `mmengine.dataset.InfiniteSampler` to enable the dataloader to iterate infinitely. 
+2024/12/26 11:03:45 - mmengine - INFO - Iter(train) [ 70/640] lr: 2.9523e-05 eta: 1:23:39 time: 8.8224 data_time: 0.5760 memory: 25159 loss: 0.6336 +2024/12/26 11:05:08 - mmengine - INFO - Iter(train) [ 80/640] lr: 2.9314e-05 eta: 1:21:35 time: 8.2903 data_time: 0.0241 memory: 25105 loss: 0.4324 +2024/12/26 11:06:31 - mmengine - INFO - Iter(train) [ 90/640] lr: 2.9069e-05 eta: 1:19:45 time: 8.3838 data_time: 0.0267 memory: 25154 loss: 0.5337 +2024/12/26 11:07:55 - mmengine - INFO - Iter(train) [100/640] lr: 2.8788e-05 eta: 1:18:01 time: 8.3697 data_time: 0.0250 memory: 25154 loss: 0.4361 +2024/12/26 11:09:20 - mmengine - INFO - Iter(train) [110/640] lr: 2.8472e-05 eta: 1:16:23 time: 8.4358 data_time: 0.0351 memory: 25140 loss: 0.4642 +2024/12/26 11:10:44 - mmengine - INFO - Iter(train) [120/640] lr: 2.8121e-05 eta: 1:14:46 time: 8.4055 data_time: 0.0300 memory: 25140 loss: 0.3679 +2024/12/26 11:11:50 - mmengine - INFO - Saving checkpoint at 128 iterations +2024/12/26 11:12:16 - mmengine - INFO - Iter(train) [130/640] lr: 2.7737e-05 eta: 1:13:44 time: 9.2617 data_time: 0.9768 memory: 25141 loss: 0.3529 +2024/12/26 11:13:38 - mmengine - INFO - Iter(train) [140/640] lr: 2.7320e-05 eta: 1:12:00 time: 8.1778 data_time: 0.0249 memory: 25145 loss: 0.1392 +2024/12/26 11:15:02 - mmengine - INFO - Iter(train) [150/640] lr: 2.6871e-05 eta: 1:10:26 time: 8.4131 data_time: 0.0259 memory: 25140 loss: 0.1616 +2024/12/26 11:16:26 - mmengine - INFO - Iter(train) [160/640] lr: 2.6393e-05 eta: 1:08:53 time: 8.4095 data_time: 0.0697 memory: 25135 loss: 0.1733 +2024/12/26 11:17:50 - mmengine - INFO - Iter(train) [170/640] lr: 2.5885e-05 eta: 1:07:19 time: 8.3325 data_time: 0.0238 memory: 25125 loss: 0.1666 +2024/12/26 11:19:13 - mmengine - INFO - Iter(train) [180/640] lr: 2.5349e-05 eta: 1:05:46 time: 8.3152 data_time: 0.0272 memory: 25135 loss: 0.1973 +2024/12/26 11:20:35 - mmengine - INFO - Iter(train) [190/640] lr: 2.4786e-05 eta: 1:04:13 time: 8.2790 data_time: 0.0280 memory: 25135 loss: 0.1474 +2024/12/26 11:20:52 - mmengine - INFO - Saving checkpoint at 192 iterations +2024/12/26 11:22:06 - mmengine - INFO - Iter(train) [200/640] lr: 2.4199e-05 eta: 1:02:59 time: 9.0581 data_time: 0.7110 memory: 25126 loss: 0.0878 +2024/12/26 11:23:29 - mmengine - INFO - Iter(train) [210/640] lr: 2.3588e-05 eta: 1:01:27 time: 8.3146 data_time: 0.0245 memory: 25130 loss: 0.0670 +2024/12/26 11:24:53 - mmengine - INFO - Iter(train) [220/640] lr: 2.2955e-05 eta: 0:59:57 time: 8.3552 data_time: 0.0263 memory: 25135 loss: 0.0770 +2024/12/26 11:26:16 - mmengine - INFO - Iter(train) [230/640] lr: 2.2302e-05 eta: 0:58:28 time: 8.3647 data_time: 0.0260 memory: 25140 loss: 0.0665 +2024/12/26 11:27:41 - mmengine - INFO - Iter(train) [240/640] lr: 2.1630e-05 eta: 0:57:01 time: 8.4819 data_time: 0.0247 memory: 25140 loss: 0.0764 +2024/12/26 11:29:05 - mmengine - INFO - Iter(train) [250/640] lr: 2.0941e-05 eta: 0:55:33 time: 8.4098 data_time: 0.0274 memory: 25140 loss: 0.0761 +2024/12/26 11:29:56 - mmengine - INFO - Saving checkpoint at 256 iterations +2024/12/26 11:30:37 - mmengine - INFO - Iter(train) [260/640] lr: 2.0237e-05 eta: 0:54:17 time: 9.1613 data_time: 0.7790 memory: 25126 loss: 0.0560 +2024/12/26 11:32:00 - mmengine - INFO - Iter(train) [270/640] lr: 1.9520e-05 eta: 0:52:47 time: 8.2784 data_time: 0.0253 memory: 25140 loss: 0.0324 +2024/12/26 11:33:23 - mmengine - INFO - Iter(train) [280/640] lr: 1.8791e-05 eta: 0:51:18 time: 8.2845 data_time: 0.0244 memory: 25145 loss: 0.0314 +2024/12/26 11:34:47 - mmengine - INFO - 
Iter(train) [290/640] lr: 1.8052e-05 eta: 0:49:51 time: 8.4338 data_time: 0.0249 memory: 25140 loss: 0.0178 +2024/12/26 11:36:12 - mmengine - INFO - Iter(train) [300/640] lr: 1.7305e-05 eta: 0:48:25 time: 8.5110 data_time: 0.0240 memory: 25140 loss: 0.0299 +2024/12/26 11:37:34 - mmengine - INFO - Iter(train) [310/640] lr: 1.6553e-05 eta: 0:46:56 time: 8.1676 data_time: 0.0249 memory: 25154 loss: 0.0364 +2024/12/26 11:38:56 - mmengine - INFO - Iter(train) [320/640] lr: 1.5796e-05 eta: 0:45:27 time: 8.2586 data_time: 0.0249 memory: 25140 loss: 0.0279 +2024/12/26 11:38:56 - mmengine - INFO - Saving checkpoint at 320 iterations +2024/12/26 11:40:26 - mmengine - INFO - Iter(train) [330/640] lr: 1.5038e-05 eta: 0:44:06 time: 8.9712 data_time: 0.5952 memory: 25159 loss: 0.0033 +2024/12/26 11:41:50 - mmengine - INFO - Iter(train) [340/640] lr: 1.4279e-05 eta: 0:42:40 time: 8.3903 data_time: 0.0239 memory: 25140 loss: 0.0103 +2024/12/26 11:43:12 - mmengine - INFO - Iter(train) [350/640] lr: 1.3523e-05 eta: 0:41:12 time: 8.2599 data_time: 0.0250 memory: 25145 loss: 0.0095 +2024/12/26 11:44:35 - mmengine - INFO - Iter(train) [360/640] lr: 1.2770e-05 eta: 0:39:44 time: 8.2206 data_time: 0.0241 memory: 25145 loss: 0.0022 +2024/12/26 11:45:57 - mmengine - INFO - Iter(train) [370/640] lr: 1.2022e-05 eta: 0:38:17 time: 8.2087 data_time: 0.0258 memory: 25149 loss: 0.0124 +2024/12/26 11:47:21 - mmengine - INFO - Iter(train) [380/640] lr: 1.1283e-05 eta: 0:36:51 time: 8.3832 data_time: 0.0244 memory: 25116 loss: 0.0075 +2024/12/26 11:47:55 - mmengine - INFO - Saving checkpoint at 384 iterations +2024/12/26 11:48:52 - mmengine - INFO - Iter(train) [390/640] lr: 1.0553e-05 eta: 0:35:30 time: 9.1668 data_time: 0.6227 memory: 25130 loss: 0.0064 +2024/12/26 11:50:17 - mmengine - INFO - Iter(train) [400/640] lr: 9.8341e-06 eta: 0:34:05 time: 8.4811 data_time: 0.0237 memory: 25101 loss: 0.0017 +2024/12/26 11:51:42 - mmengine - INFO - Iter(train) [410/640] lr: 9.1286e-06 eta: 0:32:39 time: 8.4858 data_time: 0.0246 memory: 25140 loss: 0.0011 +2024/12/26 11:53:04 - mmengine - INFO - Iter(train) [420/640] lr: 8.4381e-06 eta: 0:31:12 time: 8.2089 data_time: 0.0235 memory: 25126 loss: 0.0018 +2024/12/26 11:54:27 - mmengine - INFO - Iter(train) [430/640] lr: 7.7644e-06 eta: 0:29:47 time: 8.3463 data_time: 0.0256 memory: 25140 loss: 0.0016 +2024/12/26 11:55:50 - mmengine - INFO - Iter(train) [440/640] lr: 7.1092e-06 eta: 0:28:20 time: 8.2912 data_time: 0.0270 memory: 25154 loss: 0.0013 +2024/12/26 11:56:56 - mmengine - INFO - Saving checkpoint at 448 iterations +2024/12/26 11:57:19 - mmengine - INFO - Iter(train) [450/640] lr: 6.4742e-06 eta: 0:26:57 time: 8.8783 data_time: 0.6255 memory: 25154 loss: 0.0058 +2024/12/26 11:58:43 - mmengine - INFO - Iter(train) [460/640] lr: 5.8611e-06 eta: 0:25:31 time: 8.4040 data_time: 0.0273 memory: 25130 loss: 0.0004 +2024/12/26 12:00:09 - mmengine - INFO - Iter(train) [470/640] lr: 5.2713e-06 eta: 0:24:07 time: 8.5917 data_time: 0.0295 memory: 25140 loss: 0.0004 +2024/12/26 12:01:34 - mmengine - INFO - Iter(train) [480/640] lr: 4.7064e-06 eta: 0:22:41 time: 8.4686 data_time: 0.0244 memory: 25140 loss: 0.0005 +2024/12/26 12:02:57 - mmengine - INFO - Iter(train) [490/640] lr: 4.1678e-06 eta: 0:21:16 time: 8.2980 data_time: 0.0287 memory: 25145 loss: 0.0004 +2024/12/26 12:04:19 - mmengine - INFO - Iter(train) [500/640] lr: 3.6570e-06 eta: 0:19:50 time: 8.2091 data_time: 0.0297 memory: 25174 loss: 0.0003 +2024/12/26 12:05:41 - mmengine - INFO - Iter(train) [510/640] lr: 3.1752e-06 eta: 
0:18:24 time: 8.2422 data_time: 0.0313 memory: 25101 loss: 0.0004 +2024/12/26 12:05:58 - mmengine - INFO - Saving checkpoint at 512 iterations +2024/12/26 12:07:09 - mmengine - INFO - Iter(train) [520/640] lr: 2.7236e-06 eta: 0:17:00 time: 8.7385 data_time: 0.6205 memory: 25154 loss: 0.0003 +2024/12/26 12:08:32 - mmengine - INFO - Iter(train) [530/640] lr: 2.3035e-06 eta: 0:15:34 time: 8.3021 data_time: 0.0238 memory: 25130 loss: 0.0004 +2024/12/26 12:09:55 - mmengine - INFO - Iter(train) [540/640] lr: 1.9158e-06 eta: 0:14:09 time: 8.3019 data_time: 0.0258 memory: 25145 loss: 0.0003 +2024/12/26 12:11:18 - mmengine - INFO - Iter(train) [550/640] lr: 1.5616e-06 eta: 0:12:44 time: 8.3526 data_time: 0.0241 memory: 25140 loss: 0.0004 +2024/12/26 12:12:43 - mmengine - INFO - Iter(train) [560/640] lr: 1.2418e-06 eta: 0:11:19 time: 8.4456 data_time: 0.0240 memory: 25130 loss: 0.0003 +2024/12/26 12:14:06 - mmengine - INFO - Iter(train) [570/640] lr: 9.5724e-07 eta: 0:09:54 time: 8.3325 data_time: 0.0235 memory: 25125 loss: 0.0003 +2024/12/26 12:14:55 - mmengine - INFO - Saving checkpoint at 576 iterations +2024/12/26 12:15:35 - mmengine - INFO - Iter(train) [580/640] lr: 7.0858e-07 eta: 0:08:29 time: 8.8984 data_time: 0.6721 memory: 25140 loss: 0.0003 +2024/12/26 12:16:58 - mmengine - INFO - Iter(train) [590/640] lr: 4.9649e-07 eta: 0:07:04 time: 8.3288 data_time: 0.0239 memory: 25135 loss: 0.0004 +2024/12/26 12:18:20 - mmengine - INFO - Iter(train) [600/640] lr: 3.2151e-07 eta: 0:05:39 time: 8.1961 data_time: 0.0276 memory: 25145 loss: 0.0003 +2024/12/26 12:19:43 - mmengine - INFO - Iter(train) [610/640] lr: 1.8408e-07 eta: 0:04:14 time: 8.2283 data_time: 0.0247 memory: 25130 loss: 0.0003 +2024/12/26 12:21:05 - mmengine - INFO - Iter(train) [620/640] lr: 8.4568e-08 eta: 0:02:49 time: 8.2109 data_time: 0.0248 memory: 25135 loss: 0.0002 +2024/12/26 12:22:26 - mmengine - INFO - Iter(train) [630/640] lr: 2.3219e-08 eta: 0:01:24 time: 8.1565 data_time: 0.0251 memory: 25159 loss: 0.0003 +2024/12/26 12:23:49 - mmengine - INFO - Iter(train) [640/640] lr: 1.9195e-10 eta: 0:00:00 time: 8.2517 data_time: 0.0234 memory: 25145 loss: 0.0004 +2024/12/26 12:23:49 - mmengine - INFO - Saving checkpoint at 640 iterations diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/20241226_105245.json b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/20241226_105245.json new file mode 100644 index 0000000000000000000000000000000000000000..ac0541074deca4ae6db74afc8566cde807705dcb --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/20241226_105245.json @@ -0,0 +1,64 @@ +{"lr": 1.500015e-05, "data_time": 0.02441577911376953, "loss": 5.257675409317017, "time": 10.637859201431274, "iter": 10, "memory": 25149, "step": 10} +{"lr": 3e-05, "data_time": 0.03938922882080078, "loss": 2.692900276184082, "time": 8.636328339576721, "iter": 20, "memory": 25140, "step": 20} +{"lr": 2.99808095489134e-05, "data_time": 0.026666665077209474, "loss": 1.350151574611664, "time": 8.48230082988739, "iter": 30, "memory": 25140, "step": 30} +{"lr": 2.9923287298775314e-05, "data_time": 0.09188299179077149, "loss": 1.0310316979885101, "time": 8.40936803817749, "iter": 40, "memory": 25140, "step": 40} +{"lr": 2.9827580433309446e-05, "data_time": 0.030604100227355956, "loss": 1.015432846546173, "time": 8.392545104026794, "iter": 50, "memory": 25135, "step": 50} +{"lr": 2.9693933840238504e-05, "data_time": 0.024699020385742187, "loss": 
0.9104578614234924, "time": 8.261106276512146, "iter": 60, "memory": 25135, "step": 60} +{"lr": 2.952268948468346e-05, "data_time": 0.5760255098342896, "loss": 0.6336249113082886, "time": 8.82242019176483, "iter": 70, "memory": 25159, "step": 70} +{"lr": 2.9314285534168186e-05, "data_time": 0.024132895469665527, "loss": 0.4324179649353027, "time": 8.290298128128052, "iter": 80, "memory": 25105, "step": 80} +{"lr": 2.90692552374685e-05, "data_time": 0.026720929145812988, "loss": 0.5336983978748322, "time": 8.383773064613342, "iter": 90, "memory": 25154, "step": 90} +{"lr": 2.8788225560174216e-05, "data_time": 0.024952125549316407, "loss": 0.43612043261528016, "time": 8.369651365280152, "iter": 100, "memory": 25154, "step": 100} +{"lr": 2.847191558045545e-05, "data_time": 0.035126781463623045, "loss": 0.464183883368969, "time": 8.43579168319702, "iter": 110, "memory": 25140, "step": 110} +{"lr": 2.8121134649138086e-05, "data_time": 0.029959464073181154, "loss": 0.36793762668967245, "time": 8.405464482307433, "iter": 120, "memory": 25140, "step": 120} +{"lr": 2.7736780318796056e-05, "data_time": 0.9768228530883789, "loss": 0.3529036447405815, "time": 9.261660981178284, "iter": 130, "memory": 25141, "step": 130} +{"lr": 2.7319836047159543e-05, "data_time": 0.024857091903686523, "loss": 0.1392025537788868, "time": 8.177789330482483, "iter": 140, "memory": 25145, "step": 140} +{"lr": 2.68713686807153e-05, "data_time": 0.02593729496002197, "loss": 0.1616042286157608, "time": 8.413124871253967, "iter": 150, "memory": 25140, "step": 150} +{"lr": 2.639252572493797e-05, "data_time": 0.06974592208862304, "loss": 0.17334721721708773, "time": 8.409547686576843, "iter": 160, "memory": 25135, "step": 160} +{"lr": 2.5884532408136998e-05, "data_time": 0.02381100654602051, "loss": 0.16655125971883536, "time": 8.332503461837769, "iter": 170, "memory": 25125, "step": 170} +{"lr": 2.534868854643217e-05, "data_time": 0.027215075492858887, "loss": 0.19725488470867275, "time": 8.315246486663819, "iter": 180, "memory": 25135, "step": 180} +{"lr": 2.4786365217879254e-05, "data_time": 0.028009748458862303, "loss": 0.14739922918379306, "time": 8.278998947143554, "iter": 190, "memory": 25135, "step": 190} +{"lr": 2.419900125425576e-05, "data_time": 0.7109977960586548, "loss": 0.08783520702272654, "time": 9.058099269866943, "iter": 200, "memory": 25126, "step": 200} +{"lr": 2.3588099559483543e-05, "data_time": 0.024530315399169923, "loss": 0.06702728094533086, "time": 8.314561104774475, "iter": 210, "memory": 25130, "step": 210} +{"lr": 2.2955223264108254e-05, "data_time": 0.026329755783081055, "loss": 0.07700718303676694, "time": 8.35518283843994, "iter": 220, "memory": 25135, "step": 220} +{"lr": 2.2301991725675243e-05, "data_time": 0.02602810859680176, "loss": 0.06646526856347919, "time": 8.364746284484863, "iter": 230, "memory": 25140, "step": 230} +{"lr": 2.163007638523606e-05, "data_time": 0.024690914154052734, "loss": 0.07641889140941202, "time": 8.481892514228822, "iter": 240, "memory": 25140, "step": 240} +{"lr": 2.094119649058736e-05, "data_time": 0.02742297649383545, "loss": 0.07606312427669763, "time": 8.40981936454773, "iter": 250, "memory": 25140, "step": 250} +{"lr": 2.0237114697185536e-05, "data_time": 0.7790374040603638, "loss": 0.05600544987246394, "time": 9.161308002471923, "iter": 260, "memory": 25126, "step": 260} +{"lr": 1.9519632557992884e-05, "data_time": 0.02528250217437744, "loss": 0.032392892776988445, "time": 8.278358697891235, "iter": 270, "memory": 25140, "step": 270} +{"lr": 
1.8790585913795754e-05, "data_time": 0.024411749839782716, "loss": 0.031363090546801684, "time": 8.28447904586792, "iter": 280, "memory": 25145, "step": 280} +{"lr": 1.8051840195789513e-05, "data_time": 0.024938130378723146, "loss": 0.0178481632261537, "time": 8.433752274513244, "iter": 290, "memory": 25140, "step": 290} +{"lr": 1.7305285652449754e-05, "data_time": 0.024048709869384767, "loss": 0.029898300766944885, "time": 8.51099956035614, "iter": 300, "memory": 25140, "step": 300} +{"lr": 1.6552832512902796e-05, "data_time": 0.024897027015686034, "loss": 0.03636545571498573, "time": 8.167599749565124, "iter": 310, "memory": 25154, "step": 310} +{"lr": 1.579640609917124e-05, "data_time": 0.024878525733947755, "loss": 0.027858773327898233, "time": 8.258635640144348, "iter": 320, "memory": 25140, "step": 320} +{"lr": 1.5037941899800858e-05, "data_time": 0.5952034235000611, "loss": 0.003308120323345065, "time": 8.971233129501343, "iter": 330, "memory": 25159, "step": 330} +{"lr": 1.4279380617474167e-05, "data_time": 0.023882961273193358, "loss": 0.010263860644772648, "time": 8.390328741073608, "iter": 340, "memory": 25140, "step": 340} +{"lr": 1.3522663203282473e-05, "data_time": 0.025039005279541015, "loss": 0.009496044769184664, "time": 8.259875011444091, "iter": 350, "memory": 25145, "step": 350} +{"lr": 1.2769725890362214e-05, "data_time": 0.02414381504058838, "loss": 0.002247853419976309, "time": 8.220645809173584, "iter": 360, "memory": 25145, "step": 360} +{"lr": 1.2022495239603391e-05, "data_time": 0.02578558921813965, "loss": 0.012365863198647275, "time": 8.20867109298706, "iter": 370, "memory": 25149, "step": 370} +{"lr": 1.1282883210106502e-05, "data_time": 0.024384379386901855, "loss": 0.007487910037161783, "time": 8.383195972442627, "iter": 380, "memory": 25116, "step": 380} +{"lr": 1.0552782267001564e-05, "data_time": 0.6226678133010864, "loss": 0.006440980736806523, "time": 9.166763091087342, "iter": 390, "memory": 25130, "step": 390} +{"lr": 9.834060539146829e-06, "data_time": 0.02374246120452881, "loss": 0.0017188996760523878, "time": 8.481120586395264, "iter": 400, "memory": 25101, "step": 400} +{"lr": 9.128557039097413e-06, "data_time": 0.02455012798309326, "loss": 0.0011161714704940096, "time": 8.485801196098327, "iter": 410, "memory": 25140, "step": 410} +{"lr": 8.438076957574515e-06, "data_time": 0.02346017360687256, "loss": 0.0018305424688151105, "time": 8.208940386772156, "iter": 420, "memory": 25126, "step": 420} +{"lr": 7.764387044475588e-06, "data_time": 0.02561681270599365, "loss": 0.0015687520615756511, "time": 8.34629774093628, "iter": 430, "memory": 25140, "step": 430} +{"lr": 7.10921108824393e-06, "data_time": 0.027037811279296876, "loss": 0.0012698352773441001, "time": 8.291221165657044, "iter": 440, "memory": 25154, "step": 440} +{"lr": 6.474225505165039e-06, "data_time": 0.6254783630371094, "loss": 0.0058078222544281745, "time": 8.878330636024476, "iter": 450, "memory": 25154, "step": 450} +{"lr": 5.8610550498752785e-06, "data_time": 0.02725872993469238, "loss": 0.00038430026470450687, "time": 8.40397675037384, "iter": 460, "memory": 25130, "step": 460} +{"lr": 5.271268658058654e-06, "data_time": 0.029474282264709474, "loss": 0.0004050621733767912, "time": 8.591666626930238, "iter": 470, "memory": 25140, "step": 470} +{"lr": 4.7063754319689895e-06, "data_time": 0.02443506717681885, "loss": 0.0005315244881785475, "time": 8.468563723564149, "iter": 480, "memory": 25140, "step": 480} +{"lr": 4.167820779049542e-06, "data_time": 0.02865567207336426, "loss": 
0.00040327069436898457, "time": 8.29795262813568, "iter": 490, "memory": 25145, "step": 490} +{"lr": 3.6569827135302208e-06, "data_time": 0.02969698905944824, "loss": 0.00029993579082656654, "time": 8.209134578704834, "iter": 500, "memory": 25174, "step": 500} +{"lr": 3.175168330465622e-06, "data_time": 0.031323790550231934, "loss": 0.0003912944899639115, "time": 8.242201948165894, "iter": 510, "memory": 25101, "step": 510} +{"lr": 2.7236104612358904e-06, "data_time": 0.6205220937728881, "loss": 0.0003439043284743093, "time": 8.738465499877929, "iter": 520, "memory": 25154, "step": 520} +{"lr": 2.303464519067985e-06, "data_time": 0.02376224994659424, "loss": 0.0003896821013768204, "time": 8.302059483528136, "iter": 530, "memory": 25130, "step": 530} +{"lr": 1.9158055426488924e-06, "data_time": 0.025845718383789063, "loss": 0.00031418252910953015, "time": 8.301901626586915, "iter": 540, "memory": 25145, "step": 540} +{"lr": 1.5616254453953114e-06, "data_time": 0.024128270149230958, "loss": 0.00038391390844481064, "time": 8.352638530731202, "iter": 550, "memory": 25140, "step": 550} +{"lr": 1.2418304774182065e-06, "data_time": 0.024004268646240234, "loss": 0.00029076010396238415, "time": 8.445630288124084, "iter": 560, "memory": 25130, "step": 560} +{"lr": 9.572389066763321e-07, "data_time": 0.02346205711364746, "loss": 0.0003445907204877585, "time": 8.332459616661072, "iter": 570, "memory": 25125, "step": 570} +{"lr": 7.085789252520916e-07, "data_time": 0.6720932722091675, "loss": 0.00029142090788809584, "time": 8.89841170310974, "iter": 580, "memory": 25140, "step": 580} +{"lr": 4.964867861069083e-07, "data_time": 0.023869991302490234, "loss": 0.0003569886481272988, "time": 8.328818225860596, "iter": 590, "memory": 25135, "step": 590} +{"lr": 3.2150517508373746e-07, "data_time": 0.027643156051635743, "loss": 0.00026720061869127675, "time": 8.196135878562927, "iter": 600, "memory": 25145, "step": 600} +{"lr": 1.8408182232222553e-07, "data_time": 0.024723362922668458, "loss": 0.0002880730258766562, "time": 8.228319907188416, "iter": 610, "memory": 25130, "step": 610} +{"lr": 8.456835663962096e-08, "data_time": 0.02482869625091553, "loss": 0.0002438773080939427, "time": 8.210890913009644, "iter": 620, "memory": 25135, "step": 620} +{"lr": 2.3219405808672077e-08, "data_time": 0.0250823974609375, "loss": 0.00031344871822511775, "time": 8.156480741500854, "iter": 630, "memory": 25159, "step": 630} +{"lr": 1.9194503473318726e-10, "data_time": 0.023398423194885255, "loss": 0.0004004927046480589, "time": 8.25168981552124, "iter": 640, "memory": 25145, "step": 640} diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/config.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..1e03a56b0d09f5f7e3fe173adf5b8088953c1cea --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/config.py @@ -0,0 +1,139 @@ +accumulative_counts = 2 +batch_size = 4 +betas = ( + 0.9, + 0.999, +) +custom_hooks = [ + dict( + tokenizer=dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained'), + type='xtuner.engine.hooks.DatasetInfoHook'), +] +data_path = '/root/share/datasets/FoodieQA/sivqa_llava.json' +data_root = '/root/share/datasets/FoodieQA/' +dataloader_num_workers = 4 +default_hooks = dict( + checkpoint=dict( + 
by_epoch=False, + interval=64, + max_keep_ckpts=-1, + save_optimizer=False, + type='mmengine.hooks.CheckpointHook'), + logger=dict( + interval=10, + log_metric_by_epoch=False, + type='mmengine.hooks.LoggerHook'), + param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'), + sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'), + timer=dict(type='mmengine.hooks.IterTimerHook')) +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +image_folder = '/root/share/datasets/FoodieQA/' +launcher = 'none' +llava_dataset = dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset') +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=False) +lr = 3e-05 +max_epochs = 10 +max_length = 8192 +max_norm = 1 +model = dict( + freeze_llm=True, + freeze_visual_encoder=True, + llm_lora=dict( + lora_alpha=256, + lora_dropout=0.05, + r=128, + target_modules=None, + task_type='CAUSAL_LM', + type='peft.LoraConfig'), + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + type='xtuner.model.InternVL_V1_5') +optim_type = 'torch.optim.AdamW' +optim_wrapper = dict( + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + lr=3e-05, + type='torch.optim.AdamW', + weight_decay=0.05), + type='DeepSpeedOptimWrapper') +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + convert_to_iter_based=True, + end=0.3, + start_factor=1e-05, + type='mmengine.optim.LinearLR'), + dict( + begin=0.3, + by_epoch=True, + convert_to_iter_based=True, + end=10, + eta_min=0.0, + type='mmengine.optim.CosineAnnealingLR'), +] +path = '/root/share/new_models/OpenGVLab/InternVL2-2B' +prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.internlm2_chat' +randomness = dict(deterministic=False, seed=None) +resume = False +runner_type = 'FlexibleRunner' +save_steps = 64 +save_total_limit = -1 +strategy = dict( + config=dict( + bf16=dict(enabled=True), + fp16=dict(enabled=False, initial_scale_power=16), + gradient_accumulation_steps='auto', + gradient_clipping='auto', + train_micro_batch_size_per_gpu='auto', + zero_allow_untested_optimizer=True, + zero_force_ds_cpu_optimizer=False, + zero_optimization=dict(overlap_comm=True, stage=2)), + exclude_frozen_parameters=True, + gradient_accumulation_steps=2, + gradient_clipping=1, + sequence_parallel_size=1, + train_micro_batch_size_per_gpu=4, + type='xtuner.engine.DeepSpeedStrategy') +tokenizer = dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained') +train_cfg = dict(max_epochs=10, type='xtuner.engine.runner.TrainLoop') +train_dataloader = dict( + batch_size=4, + collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'), + dataset=dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset'), + num_workers=4, + sampler=dict( + length_property='modality_length', + per_device_batch_size=8, + type='xtuner.dataset.samplers.LengthGroupedSampler')) +visualizer = None +warmup_ratio = 0.03 +weight_decay = 
0.05 +work_dir = './work_dirs/internvl_v2_internlm2_2b_lora_finetune_food' diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/scalars.json b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/scalars.json new file mode 100644 index 0000000000000000000000000000000000000000..ac0541074deca4ae6db74afc8566cde807705dcb --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/scalars.json @@ -0,0 +1,64 @@ +{"lr": 1.500015e-05, "data_time": 0.02441577911376953, "loss": 5.257675409317017, "time": 10.637859201431274, "iter": 10, "memory": 25149, "step": 10} +{"lr": 3e-05, "data_time": 0.03938922882080078, "loss": 2.692900276184082, "time": 8.636328339576721, "iter": 20, "memory": 25140, "step": 20} +{"lr": 2.99808095489134e-05, "data_time": 0.026666665077209474, "loss": 1.350151574611664, "time": 8.48230082988739, "iter": 30, "memory": 25140, "step": 30} +{"lr": 2.9923287298775314e-05, "data_time": 0.09188299179077149, "loss": 1.0310316979885101, "time": 8.40936803817749, "iter": 40, "memory": 25140, "step": 40} +{"lr": 2.9827580433309446e-05, "data_time": 0.030604100227355956, "loss": 1.015432846546173, "time": 8.392545104026794, "iter": 50, "memory": 25135, "step": 50} +{"lr": 2.9693933840238504e-05, "data_time": 0.024699020385742187, "loss": 0.9104578614234924, "time": 8.261106276512146, "iter": 60, "memory": 25135, "step": 60} +{"lr": 2.952268948468346e-05, "data_time": 0.5760255098342896, "loss": 0.6336249113082886, "time": 8.82242019176483, "iter": 70, "memory": 25159, "step": 70} +{"lr": 2.9314285534168186e-05, "data_time": 0.024132895469665527, "loss": 0.4324179649353027, "time": 8.290298128128052, "iter": 80, "memory": 25105, "step": 80} +{"lr": 2.90692552374685e-05, "data_time": 0.026720929145812988, "loss": 0.5336983978748322, "time": 8.383773064613342, "iter": 90, "memory": 25154, "step": 90} +{"lr": 2.8788225560174216e-05, "data_time": 0.024952125549316407, "loss": 0.43612043261528016, "time": 8.369651365280152, "iter": 100, "memory": 25154, "step": 100} +{"lr": 2.847191558045545e-05, "data_time": 0.035126781463623045, "loss": 0.464183883368969, "time": 8.43579168319702, "iter": 110, "memory": 25140, "step": 110} +{"lr": 2.8121134649138086e-05, "data_time": 0.029959464073181154, "loss": 0.36793762668967245, "time": 8.405464482307433, "iter": 120, "memory": 25140, "step": 120} +{"lr": 2.7736780318796056e-05, "data_time": 0.9768228530883789, "loss": 0.3529036447405815, "time": 9.261660981178284, "iter": 130, "memory": 25141, "step": 130} +{"lr": 2.7319836047159543e-05, "data_time": 0.024857091903686523, "loss": 0.1392025537788868, "time": 8.177789330482483, "iter": 140, "memory": 25145, "step": 140} +{"lr": 2.68713686807153e-05, "data_time": 0.02593729496002197, "loss": 0.1616042286157608, "time": 8.413124871253967, "iter": 150, "memory": 25140, "step": 150} +{"lr": 2.639252572493797e-05, "data_time": 0.06974592208862304, "loss": 0.17334721721708773, "time": 8.409547686576843, "iter": 160, "memory": 25135, "step": 160} +{"lr": 2.5884532408136998e-05, "data_time": 0.02381100654602051, "loss": 0.16655125971883536, "time": 8.332503461837769, "iter": 170, "memory": 25125, "step": 170} +{"lr": 2.534868854643217e-05, "data_time": 0.027215075492858887, "loss": 0.19725488470867275, "time": 8.315246486663819, "iter": 180, "memory": 25135, "step": 180} +{"lr": 2.4786365217879254e-05, "data_time": 0.028009748458862303, "loss": 0.14739922918379306, "time": 8.278998947143554, "iter": 190, 
"memory": 25135, "step": 190} +{"lr": 2.419900125425576e-05, "data_time": 0.7109977960586548, "loss": 0.08783520702272654, "time": 9.058099269866943, "iter": 200, "memory": 25126, "step": 200} +{"lr": 2.3588099559483543e-05, "data_time": 0.024530315399169923, "loss": 0.06702728094533086, "time": 8.314561104774475, "iter": 210, "memory": 25130, "step": 210} +{"lr": 2.2955223264108254e-05, "data_time": 0.026329755783081055, "loss": 0.07700718303676694, "time": 8.35518283843994, "iter": 220, "memory": 25135, "step": 220} +{"lr": 2.2301991725675243e-05, "data_time": 0.02602810859680176, "loss": 0.06646526856347919, "time": 8.364746284484863, "iter": 230, "memory": 25140, "step": 230} +{"lr": 2.163007638523606e-05, "data_time": 0.024690914154052734, "loss": 0.07641889140941202, "time": 8.481892514228822, "iter": 240, "memory": 25140, "step": 240} +{"lr": 2.094119649058736e-05, "data_time": 0.02742297649383545, "loss": 0.07606312427669763, "time": 8.40981936454773, "iter": 250, "memory": 25140, "step": 250} +{"lr": 2.0237114697185536e-05, "data_time": 0.7790374040603638, "loss": 0.05600544987246394, "time": 9.161308002471923, "iter": 260, "memory": 25126, "step": 260} +{"lr": 1.9519632557992884e-05, "data_time": 0.02528250217437744, "loss": 0.032392892776988445, "time": 8.278358697891235, "iter": 270, "memory": 25140, "step": 270} +{"lr": 1.8790585913795754e-05, "data_time": 0.024411749839782716, "loss": 0.031363090546801684, "time": 8.28447904586792, "iter": 280, "memory": 25145, "step": 280} +{"lr": 1.8051840195789513e-05, "data_time": 0.024938130378723146, "loss": 0.0178481632261537, "time": 8.433752274513244, "iter": 290, "memory": 25140, "step": 290} +{"lr": 1.7305285652449754e-05, "data_time": 0.024048709869384767, "loss": 0.029898300766944885, "time": 8.51099956035614, "iter": 300, "memory": 25140, "step": 300} +{"lr": 1.6552832512902796e-05, "data_time": 0.024897027015686034, "loss": 0.03636545571498573, "time": 8.167599749565124, "iter": 310, "memory": 25154, "step": 310} +{"lr": 1.579640609917124e-05, "data_time": 0.024878525733947755, "loss": 0.027858773327898233, "time": 8.258635640144348, "iter": 320, "memory": 25140, "step": 320} +{"lr": 1.5037941899800858e-05, "data_time": 0.5952034235000611, "loss": 0.003308120323345065, "time": 8.971233129501343, "iter": 330, "memory": 25159, "step": 330} +{"lr": 1.4279380617474167e-05, "data_time": 0.023882961273193358, "loss": 0.010263860644772648, "time": 8.390328741073608, "iter": 340, "memory": 25140, "step": 340} +{"lr": 1.3522663203282473e-05, "data_time": 0.025039005279541015, "loss": 0.009496044769184664, "time": 8.259875011444091, "iter": 350, "memory": 25145, "step": 350} +{"lr": 1.2769725890362214e-05, "data_time": 0.02414381504058838, "loss": 0.002247853419976309, "time": 8.220645809173584, "iter": 360, "memory": 25145, "step": 360} +{"lr": 1.2022495239603391e-05, "data_time": 0.02578558921813965, "loss": 0.012365863198647275, "time": 8.20867109298706, "iter": 370, "memory": 25149, "step": 370} +{"lr": 1.1282883210106502e-05, "data_time": 0.024384379386901855, "loss": 0.007487910037161783, "time": 8.383195972442627, "iter": 380, "memory": 25116, "step": 380} +{"lr": 1.0552782267001564e-05, "data_time": 0.6226678133010864, "loss": 0.006440980736806523, "time": 9.166763091087342, "iter": 390, "memory": 25130, "step": 390} +{"lr": 9.834060539146829e-06, "data_time": 0.02374246120452881, "loss": 0.0017188996760523878, "time": 8.481120586395264, "iter": 400, "memory": 25101, "step": 400} +{"lr": 9.128557039097413e-06, "data_time": 
0.02455012798309326, "loss": 0.0011161714704940096, "time": 8.485801196098327, "iter": 410, "memory": 25140, "step": 410} +{"lr": 8.438076957574515e-06, "data_time": 0.02346017360687256, "loss": 0.0018305424688151105, "time": 8.208940386772156, "iter": 420, "memory": 25126, "step": 420} +{"lr": 7.764387044475588e-06, "data_time": 0.02561681270599365, "loss": 0.0015687520615756511, "time": 8.34629774093628, "iter": 430, "memory": 25140, "step": 430} +{"lr": 7.10921108824393e-06, "data_time": 0.027037811279296876, "loss": 0.0012698352773441001, "time": 8.291221165657044, "iter": 440, "memory": 25154, "step": 440} +{"lr": 6.474225505165039e-06, "data_time": 0.6254783630371094, "loss": 0.0058078222544281745, "time": 8.878330636024476, "iter": 450, "memory": 25154, "step": 450} +{"lr": 5.8610550498752785e-06, "data_time": 0.02725872993469238, "loss": 0.00038430026470450687, "time": 8.40397675037384, "iter": 460, "memory": 25130, "step": 460} +{"lr": 5.271268658058654e-06, "data_time": 0.029474282264709474, "loss": 0.0004050621733767912, "time": 8.591666626930238, "iter": 470, "memory": 25140, "step": 470} +{"lr": 4.7063754319689895e-06, "data_time": 0.02443506717681885, "loss": 0.0005315244881785475, "time": 8.468563723564149, "iter": 480, "memory": 25140, "step": 480} +{"lr": 4.167820779049542e-06, "data_time": 0.02865567207336426, "loss": 0.00040327069436898457, "time": 8.29795262813568, "iter": 490, "memory": 25145, "step": 490} +{"lr": 3.6569827135302208e-06, "data_time": 0.02969698905944824, "loss": 0.00029993579082656654, "time": 8.209134578704834, "iter": 500, "memory": 25174, "step": 500} +{"lr": 3.175168330465622e-06, "data_time": 0.031323790550231934, "loss": 0.0003912944899639115, "time": 8.242201948165894, "iter": 510, "memory": 25101, "step": 510} +{"lr": 2.7236104612358904e-06, "data_time": 0.6205220937728881, "loss": 0.0003439043284743093, "time": 8.738465499877929, "iter": 520, "memory": 25154, "step": 520} +{"lr": 2.303464519067985e-06, "data_time": 0.02376224994659424, "loss": 0.0003896821013768204, "time": 8.302059483528136, "iter": 530, "memory": 25130, "step": 530} +{"lr": 1.9158055426488924e-06, "data_time": 0.025845718383789063, "loss": 0.00031418252910953015, "time": 8.301901626586915, "iter": 540, "memory": 25145, "step": 540} +{"lr": 1.5616254453953114e-06, "data_time": 0.024128270149230958, "loss": 0.00038391390844481064, "time": 8.352638530731202, "iter": 550, "memory": 25140, "step": 550} +{"lr": 1.2418304774182065e-06, "data_time": 0.024004268646240234, "loss": 0.00029076010396238415, "time": 8.445630288124084, "iter": 560, "memory": 25130, "step": 560} +{"lr": 9.572389066763321e-07, "data_time": 0.02346205711364746, "loss": 0.0003445907204877585, "time": 8.332459616661072, "iter": 570, "memory": 25125, "step": 570} +{"lr": 7.085789252520916e-07, "data_time": 0.6720932722091675, "loss": 0.00029142090788809584, "time": 8.89841170310974, "iter": 580, "memory": 25140, "step": 580} +{"lr": 4.964867861069083e-07, "data_time": 0.023869991302490234, "loss": 0.0003569886481272988, "time": 8.328818225860596, "iter": 590, "memory": 25135, "step": 590} +{"lr": 3.2150517508373746e-07, "data_time": 0.027643156051635743, "loss": 0.00026720061869127675, "time": 8.196135878562927, "iter": 600, "memory": 25145, "step": 600} +{"lr": 1.8408182232222553e-07, "data_time": 0.024723362922668458, "loss": 0.0002880730258766562, "time": 8.228319907188416, "iter": 610, "memory": 25130, "step": 610} +{"lr": 8.456835663962096e-08, "data_time": 0.02482869625091553, "loss": 
0.0002438773080939427, "time": 8.210890913009644, "iter": 620, "memory": 25135, "step": 620} +{"lr": 2.3219405808672077e-08, "data_time": 0.0250823974609375, "loss": 0.00031344871822511775, "time": 8.156480741500854, "iter": 630, "memory": 25159, "step": 630} +{"lr": 1.9194503473318726e-10, "data_time": 0.023398423194885255, "loss": 0.0004004927046480589, "time": 8.25168981552124, "iter": 640, "memory": 25145, "step": 640} diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/internvl_v2_internlm2_2b_lora_finetune_food.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/internvl_v2_internlm2_2b_lora_finetune_food.py new file mode 100644 index 0000000000000000000000000000000000000000..1e03a56b0d09f5f7e3fe173adf5b8088953c1cea --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/internvl_v2_internlm2_2b_lora_finetune_food.py @@ -0,0 +1,139 @@ +accumulative_counts = 2 +batch_size = 4 +betas = ( + 0.9, + 0.999, +) +custom_hooks = [ + dict( + tokenizer=dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained'), + type='xtuner.engine.hooks.DatasetInfoHook'), +] +data_path = '/root/share/datasets/FoodieQA/sivqa_llava.json' +data_root = '/root/share/datasets/FoodieQA/' +dataloader_num_workers = 4 +default_hooks = dict( + checkpoint=dict( + by_epoch=False, + interval=64, + max_keep_ckpts=-1, + save_optimizer=False, + type='mmengine.hooks.CheckpointHook'), + logger=dict( + interval=10, + log_metric_by_epoch=False, + type='mmengine.hooks.LoggerHook'), + param_scheduler=dict(type='mmengine.hooks.ParamSchedulerHook'), + sampler_seed=dict(type='mmengine.hooks.DistSamplerSeedHook'), + timer=dict(type='mmengine.hooks.IterTimerHook')) +env_cfg = dict( + cudnn_benchmark=False, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +image_folder = '/root/share/datasets/FoodieQA/' +launcher = 'none' +llava_dataset = dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset') +load_from = None +log_level = 'INFO' +log_processor = dict(by_epoch=False) +lr = 3e-05 +max_epochs = 10 +max_length = 8192 +max_norm = 1 +model = dict( + freeze_llm=True, + freeze_visual_encoder=True, + llm_lora=dict( + lora_alpha=256, + lora_dropout=0.05, + r=128, + target_modules=None, + task_type='CAUSAL_LM', + type='peft.LoraConfig'), + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + type='xtuner.model.InternVL_V1_5') +optim_type = 'torch.optim.AdamW' +optim_wrapper = dict( + optimizer=dict( + betas=( + 0.9, + 0.999, + ), + lr=3e-05, + type='torch.optim.AdamW', + weight_decay=0.05), + type='DeepSpeedOptimWrapper') +param_scheduler = [ + dict( + begin=0, + by_epoch=True, + convert_to_iter_based=True, + end=0.3, + start_factor=1e-05, + type='mmengine.optim.LinearLR'), + dict( + begin=0.3, + by_epoch=True, + convert_to_iter_based=True, + end=10, + eta_min=0.0, + type='mmengine.optim.CosineAnnealingLR'), +] +path = '/root/share/new_models/OpenGVLab/InternVL2-2B' +prompt_template = 'xtuner.utils.PROMPT_TEMPLATE.internlm2_chat' +randomness = dict(deterministic=False, seed=None) +resume = False +runner_type = 'FlexibleRunner' +save_steps = 64 +save_total_limit = -1 +strategy = dict( + config=dict( + 
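The scalars.json log above shows a smooth run: the loss falls from about 5.26 at iteration 10 to roughly 4e-4 by iteration 640, while the cosine schedule anneals the learning rate to essentially zero. A minimal inspection sketch for this JSON-lines log follows; it assumes matplotlib is installed and that the path below, relative to the repository root, is where the file lands.

import json

import matplotlib.pyplot as plt

# Sketch only: read the JSON-lines training log and plot loss / lr per iteration.
log_path = "work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/20241226_105245/vis_data/scalars.json"
with open(log_path) as f:
    records = [json.loads(line) for line in f if line.strip()]

iters = [r["iter"] for r in records]
fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(iters, [r["loss"] for r in records])
ax_loss.set(xlabel="iter", ylabel="loss", yscale="log")
ax_lr.plot(iters, [r["lr"] for r in records])
ax_lr.set(xlabel="iter", ylabel="lr")
fig.tight_layout()
fig.savefig("loss_lr_curves.png")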
bf16=dict(enabled=True), + fp16=dict(enabled=False, initial_scale_power=16), + gradient_accumulation_steps='auto', + gradient_clipping='auto', + train_micro_batch_size_per_gpu='auto', + zero_allow_untested_optimizer=True, + zero_force_ds_cpu_optimizer=False, + zero_optimization=dict(overlap_comm=True, stage=2)), + exclude_frozen_parameters=True, + gradient_accumulation_steps=2, + gradient_clipping=1, + sequence_parallel_size=1, + train_micro_batch_size_per_gpu=4, + type='xtuner.engine.DeepSpeedStrategy') +tokenizer = dict( + pretrained_model_name_or_path= + '/root/share/new_models/OpenGVLab/InternVL2-2B', + trust_remote_code=True, + type='transformers.AutoTokenizer.from_pretrained') +train_cfg = dict(max_epochs=10, type='xtuner.engine.runner.TrainLoop') +train_dataloader = dict( + batch_size=4, + collate_fn=dict(type='xtuner.dataset.collate_fns.default_collate_fn'), + dataset=dict( + data_paths='/root/share/datasets/FoodieQA/sivqa_llava.json', + image_folders='/root/share/datasets/FoodieQA/', + max_length=8192, + model_path='/root/share/new_models/OpenGVLab/InternVL2-2B', + template='xtuner.utils.PROMPT_TEMPLATE.internlm2_chat', + type='xtuner.dataset.InternVL_V1_5_Dataset'), + num_workers=4, + sampler=dict( + length_property='modality_length', + per_device_batch_size=8, + type='xtuner.dataset.samplers.LengthGroupedSampler')) +visualizer = None +warmup_ratio = 0.03 +weight_decay = 0.05 +work_dir = './work_dirs/internvl_v2_internlm2_2b_lora_finetune_food' diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_128.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_128.pth new file mode 100644 index 0000000000000000000000000000000000000000..9a6a649892bd1ce16d06f15745c00b0b04656772 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_128.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98e8588a9b8c44351c22836b3e3dab1203175007f77ed9270b4130d3e7948e14 +size 301178754 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_192.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_192.pth new file mode 100644 index 0000000000000000000000000000000000000000..159da2661317ac7fe19406d8f92a6a4bc4383011 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_192.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9efc61d2a2e2ec1720c5484b68bfed2ff9b1e18ed29cc867091683f7210f9656 +size 301183490 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_256.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_256.pth new file mode 100644 index 0000000000000000000000000000000000000000..052a1947bc61c48453ecd8a35c18c2cfa43ee08c --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_256.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:867b3918646ac33d86cec9431a4672a2584ea25884aa944b30f39012ca97baf0 +size 301188226 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_320.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_320.pth new file mode 100644 index 0000000000000000000000000000000000000000..1851c95bc9d9506710c10e517767da671517e152 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_320.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f951bbf33457fcdde8b27ce587ded453e971799aa8f2031d05423aad71c60ce6 +size 301192962 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_384.pth 
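The config above freezes both the LLM and the visual encoder and trains only a LoRA adapter on the language model (r=128, alpha=256, dropout 0.05), with a per-GPU micro-batch of 4 and 2 accumulation steps, i.e. an effective batch size of 8 per GPU. The sketch below spells out the equivalent peft call; it is illustrative of what xtuner builds from the llm_lora block, not code contained in this commit.

from peft import LoraConfig

# Illustrative reconstruction of the llm_lora block from the config above.
llm_lora = LoraConfig(
    r=128,
    lora_alpha=256,
    lora_dropout=0.05,
    target_modules=None,  # left unset in the config; resolved downstream by xtuner/peft
    task_type="CAUSAL_LM",
)

effective_batch_per_gpu = 4 * 2  # batch_size x accumulative_counts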
b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_384.pth new file mode 100644 index 0000000000000000000000000000000000000000..4ab748956db47018eda56c82771301f404a95f93 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_384.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f84ff5f36983ff0d068808b226d10a2f401f104baace9b8fafe18032f5fc96 +size 301197634 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_448.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_448.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba2310d6456d9055075d01fff9a66c59b8230e79 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_448.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47bac29617eec747db94fbfb17c7d6f62276efc6f8389b63f5fde7359430c186 +size 301202306 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_512.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_512.pth new file mode 100644 index 0000000000000000000000000000000000000000..062760ecc9640674ddf8de7e155e001ed9cde1bc --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a14c00540353647fbe7fcfce3308d8a878b8d83640eeb37f76de2adba18302c0 +size 301206978 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_576.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_576.pth new file mode 100644 index 0000000000000000000000000000000000000000..93ce7bc93f7660deaa9cb3b2f3dc8bd705171778 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_576.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6199011455ec40b5733d21d3940779b82b4c3a34fa939c0ed5d17bdb09207390 +size 301211650 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_64.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_64.pth new file mode 100644 index 0000000000000000000000000000000000000000..32979c03749fafcac2e6855d6290d2ae69c876e0 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_64.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d83779661adcbef092709fe40c0e2d68759f75ae7d281f8f8b297d046b11cc0 +size 301174018 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_640.pth b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_640.pth new file mode 100644 index 0000000000000000000000000000000000000000..888779a14b085fe4160c9c2d1c8c563b6d16c7e0 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_640.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ef52dc7217abb84fe946105873e0fe7649de4eed91594bc6b9aa3e5073bc24 +size 301216322 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/last_checkpoint b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/last_checkpoint new file mode 100644 index 0000000000000000000000000000000000000000..97c2f6be9e15621d83429ead8e174027aefbe461 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/last_checkpoint @@ -0,0 +1 @@ +/root/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_640.pth \ No newline at end of file diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/added_tokens.json 
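Checkpoints are written every 64 iterations over the 640-iteration run, and each .pth file holds only the trainable (non-frozen) parameters, which is why they stay around 300 MB. The lr35_ep10 folder that follows is the HuggingFace-format export; a plausible way to regenerate such an export from the final checkpoint, assuming the standard xtuner command-line tools are available, is sketched below.

import subprocess

# Hypothetical export step (not part of this commit): merge the final xtuner
# checkpoint into a HuggingFace-style folder using xtuner's converter.
subprocess.run(
    [
        "xtuner", "convert", "pth_to_hf",
        "work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/internvl_v2_internlm2_2b_lora_finetune_food.py",
        "work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/iter_640.pth",
        "work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10",
    ],
    check=True,
)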
b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..35f5893c8e29d6102945a953529819a2d56c62a9 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/added_tokens.json @@ -0,0 +1,11 @@ +{ + "</box>": 92552, + "</img>": 92545, + "</quad>": 92548, + "</ref>": 92550, + "<IMG_CONTEXT>": 92546, + "<box>": 92551, + "<img>": 92544, + "<quad>": 92547, + "<ref>": 92549 +} diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/config.json b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec19e22b18d899c19f8ac5c82cc8df8d6ca42987 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/config.json @@ -0,0 +1,199 @@ +{ + "_commit_hash": null, + "_name_or_path": "/root/share/new_models/OpenGVLab/InternVL2-2B", + "architectures": [ + "InternVLChatModel" + ], + "auto_map": { + "AutoConfig": "configuration_internvl_chat.InternVLChatConfig", + "AutoModel": "modeling_internvl_chat.InternVLChatModel", + "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel" + }, + "downsample_ratio": 0.5, + "dynamic_image_size": true, + "force_image_size": 448, + "llm_config": { + "_name_or_path": "internlm/internlm2-chat-1_8b", + "add_cross_attention": false, + "architectures": [ + "InternLM2ForCausalLM" + ], + "attn_implementation": "eager", + "auto_map": { + "AutoConfig": "configuration_internlm2.InternLM2Config", + "AutoModel": "modeling_internlm2.InternLM2ForCausalLM", + "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM" + }, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bias": false, + "bos_token_id": 1, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 2, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "silu", + "hidden_size": 2048, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "length_penalty": 1.0, + "max_length": 20, + "max_position_embeddings": 32768, + "min_length": 0, + "model_type": "internlm2", + "no_repeat_ngram_size": 0, + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_hidden_layers": 24, + "num_key_value_heads": 8, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": 2, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 2.0, + "type": "dynamic" + }, + "rope_theta": 1000000, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": false, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.39.0", + "typical_p": 1.0, + "use_bfloat16": true, + "use_cache": true, + "vocab_size": 92553 + }, + 
"max_dynamic_patch": 12, + "min_dynamic_patch": 1, + "model_type": "internvl_chat", + "ps_version": "v2", + "select_layer": -1, + "template": "internlm2-chat", + "torch_dtype": "bfloat16", + "transformers_version": null, + "use_backbone_lora": 0, + "use_llm_lora": 0, + "use_thumbnail": true, + "vision_config": { + "_name_or_path": "", + "add_cross_attention": false, + "architectures": [ + "InternVisionModel" + ], + "attention_dropout": 0.0, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "bos_token_id": null, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": null, + "decoder_start_token_id": null, + "diversity_penalty": 0.0, + "do_sample": false, + "drop_path_rate": 0.0, + "dropout": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": null, + "exponential_decay_length_penalty": null, + "finetuning_task": null, + "forced_bos_token_id": null, + "forced_eos_token_id": null, + "hidden_act": "gelu", + "hidden_size": 1024, + "id2label": { + "0": "LABEL_0", + "1": "LABEL_1" + }, + "image_size": 448, + "initializer_factor": 1.0, + "initializer_range": 0.02, + "intermediate_size": 4096, + "is_decoder": false, + "is_encoder_decoder": false, + "label2id": { + "LABEL_0": 0, + "LABEL_1": 1 + }, + "layer_norm_eps": 1e-06, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "intern_vit_6b", + "no_repeat_ngram_size": 0, + "norm_type": "layer_norm", + "num_attention_heads": 16, + "num_beam_groups": 1, + "num_beams": 1, + "num_channels": 3, + "num_hidden_layers": 24, + "num_return_sequences": 1, + "output_attentions": false, + "output_hidden_states": false, + "output_scores": false, + "pad_token_id": null, + "patch_size": 14, + "prefix": null, + "problem_type": null, + "pruned_heads": {}, + "qk_normalization": false, + "qkv_bias": true, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict": true, + "return_dict_in_generate": false, + "sep_token_id": null, + "suppress_tokens": null, + "task_specific_params": null, + "temperature": 1.0, + "tf_legacy_loss": false, + "tie_encoder_decoder": false, + "tie_word_embeddings": true, + "tokenizer_class": null, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": "bfloat16", + "torchscript": false, + "transformers_version": "4.39.0", + "typical_p": 1.0, + "use_bfloat16": true, + "use_flash_attn": true + } +} diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_intern_vit.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_intern_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..ac60112c79abc35627a5b6b58e760c2f78e71839 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_intern_vit.py @@ -0,0 +1,119 @@ +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +import os +from typing import Union + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + + +class InternVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to + instantiate a vision encoder according to the specified arguments, defining the model architecture. 
+ + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + Number of color channels in the input images (e.g., 3 for RGB). + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + qkv_bias (`bool`, *optional*, defaults to `False`): + Whether to add a bias to the queries and values in the self-attention layers. + hidden_size (`int`, *optional*, defaults to 3200): + Dimensionality of the encoder layers and the pooler layer. + num_attention_heads (`int`, *optional*, defaults to 25): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 12800): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + qk_normalization (`bool`, *optional*, defaults to `True`): + Whether to normalize the queries and keys in the self-attention layers. + num_hidden_layers (`int`, *optional*, defaults to 48): + Number of hidden layers in the Transformer encoder. + use_flash_attn (`bool`, *optional*, defaults to `True`): + Whether to use flash attention mechanism. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-6): + The epsilon used by the layer normalization layers. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + drop_path_rate (`float`, *optional*, defaults to 0.0): + Dropout rate for stochastic depth. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 0.1): + A factor for layer scale. 
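The defaults documented above describe the 6B vision tower (hidden size 3200, 48 layers); the 2B export in this commit overrides them through the vision_config block of config.json. The sketch below just restates those effective values; it assumes it is run from inside the lr35_ep10 folder so the bundled module imports.

from configuration_intern_vit import InternVisionConfig

# Effective vision-tower settings for this export, copied from config.json above.
vision_cfg = InternVisionConfig(
    hidden_size=1024,
    intermediate_size=4096,
    num_hidden_layers=24,
    num_attention_heads=16,
    image_size=448,
    patch_size=14,
    qkv_bias=True,
    qk_normalization=False,
    norm_type="layer_norm",
)
print(vision_cfg.hidden_size, vision_cfg.num_hidden_layers)  # 1024 24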
+ """ + + model_type = 'intern_vit_6b' + + def __init__( + self, + num_channels=3, + patch_size=14, + image_size=224, + qkv_bias=False, + hidden_size=3200, + num_attention_heads=25, + intermediate_size=12800, + qk_normalization=True, + num_hidden_layers=48, + use_flash_attn=True, + hidden_act='gelu', + norm_type='rms_norm', + layer_norm_eps=1e-6, + dropout=0.0, + drop_path_rate=0.0, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=0.1, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.dropout = dropout + self.drop_path_rate = drop_path_rate + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.norm_type = norm_type + self.qkv_bias = qkv_bias + self.qk_normalization = qk_normalization + self.use_flash_attn = use_flash_attn + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig': + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + if 'vision_config' in config_dict: + config_dict = config_dict['vision_config'] + + if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.' + ) + + return cls.from_dict(config_dict, **kwargs) diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_internlm2.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_internlm2.py new file mode 100644 index 0000000000000000000000000000000000000000..282b13b1e2066ecc074ecae87b35a19d251f0ed7 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_internlm2.py @@ -0,0 +1,150 @@ +# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on transformers/src/transformers/models/llama/configuration_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" InternLM2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {} + + +# Modified from transformers.model.llama.configuration_llama.LlamaConfig +class InternLM2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`InternLM2Model`]. 
It is used to instantiate + an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the InternLM2-7B. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`InternLM2Model`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. 
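Because config.json maps AutoConfig and AutoModel onto these bundled modules via auto_map, the export loads directly through transformers' trust_remote_code path. A minimal loading sketch follows; the local path and the CUDA device are assumptions, and a transformers release around 4.39 (the version recorded in config.json) is expected.

import torch
from transformers import AutoModel, AutoTokenizer

# Minimal loading sketch for the exported folder (assumes a CUDA device).
path = "work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10"
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,  # matches "torch_dtype": "bfloat16" in config.json
    trust_remote_code=True,
).eval().cuda()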
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + Example: + + """ + model_type = 'internlm2' + _auto_class = 'AutoConfig' + + def __init__( # pylint: disable=W0102 + self, + vocab_size=103168, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act='silu', + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + bias=True, + rope_theta=10000, + rope_scaling=None, + attn_implementation='eager', + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.bias = bias + + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + + self.attn_implementation = attn_implementation + if self.attn_implementation is None: + self.attn_implementation = 'eager' + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + '`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, ' + f'got {self.rope_scaling}' + ) + rope_scaling_type = self.rope_scaling.get('type', None) + rope_scaling_factor = self.rope_scaling.get('factor', None) + if rope_scaling_type is None or rope_scaling_type not in ['linear', 'dynamic']: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor < 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float >= 1, got {rope_scaling_factor}") diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_internvl_chat.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_internvl_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..b5a518b7883535e2038fcd2d2fdd32f3c14da5ee --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/configuration_internvl_chat.py @@ -0,0 +1,96 @@ +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- + +import copy + +from transformers import AutoConfig, LlamaConfig +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging + +from .configuration_intern_vit import InternVisionConfig +from .configuration_internlm2 import InternLM2Config + +logger = logging.get_logger(__name__) + + +class InternVLChatConfig(PretrainedConfig): + 
model_type = 'internvl_chat' + is_composition = True + + def __init__( + self, + vision_config=None, + llm_config=None, + use_backbone_lora=0, + use_llm_lora=0, + select_layer=-1, + force_image_size=None, + downsample_ratio=0.5, + template=None, + dynamic_image_size=False, + use_thumbnail=False, + ps_version='v1', + min_dynamic_patch=1, + max_dynamic_patch=6, + **kwargs): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info('vision_config is None. Initializing the InternVisionConfig with default values.') + + if llm_config is None: + llm_config = {} + logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).') + + self.vision_config = InternVisionConfig(**vision_config) + if llm_config['architectures'][0] == 'LlamaForCausalLM': + self.llm_config = LlamaConfig(**llm_config) + elif llm_config['architectures'][0] == 'InternLM2ForCausalLM': + self.llm_config = InternLM2Config(**llm_config) + else: + raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0])) + self.use_backbone_lora = use_backbone_lora + self.use_llm_lora = use_llm_lora + self.select_layer = select_layer + self.force_image_size = force_image_size + self.downsample_ratio = downsample_ratio + self.template = template + self.dynamic_image_size = dynamic_image_size + self.use_thumbnail = use_thumbnail + self.ps_version = ps_version # pixel shuffle version + self.min_dynamic_patch = min_dynamic_patch + self.max_dynamic_patch = max_dynamic_patch + + logger.info(f'vision_select_layer: {self.select_layer}') + logger.info(f'ps_version: {self.ps_version}') + logger.info(f'min_dynamic_patch: {self.min_dynamic_patch}') + logger.info(f'max_dynamic_patch: {self.max_dynamic_patch}') + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + output['vision_config'] = self.vision_config.to_dict() + output['llm_config'] = self.llm_config.to_dict() + output['model_type'] = self.__class__.model_type + output['use_backbone_lora'] = self.use_backbone_lora + output['use_llm_lora'] = self.use_llm_lora + output['select_layer'] = self.select_layer + output['force_image_size'] = self.force_image_size + output['downsample_ratio'] = self.downsample_ratio + output['template'] = self.template + output['dynamic_image_size'] = self.dynamic_image_size + output['use_thumbnail'] = self.use_thumbnail + output['ps_version'] = self.ps_version + output['min_dynamic_patch'] = self.min_dynamic_patch + output['max_dynamic_patch'] = self.max_dynamic_patch + + return output diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/conversation.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/conversation.py new file mode 100644 index 0000000000000000000000000000000000000000..644478288ec64fa19f621624d51dc00c08c5e975 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/conversation.py @@ -0,0 +1,393 @@ +""" +Conversation prompt templates. + +We kindly request that you import fastchat instead of copying this file if you wish to use it. +If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates. 
+""" + +import dataclasses +from enum import IntEnum, auto +from typing import Any, Dict, List, Tuple, Union + + +class SeparatorStyle(IntEnum): + """Separator styles.""" + + ADD_COLON_SINGLE = auto() + ADD_COLON_TWO = auto() + ADD_COLON_SPACE_SINGLE = auto() + NO_COLON_SINGLE = auto() + NO_COLON_TWO = auto() + ADD_NEW_LINE_SINGLE = auto() + LLAMA2 = auto() + CHATGLM = auto() + CHATML = auto() + CHATINTERN = auto() + DOLLY = auto() + RWKV = auto() + PHOENIX = auto() + ROBIN = auto() + FALCON_CHAT = auto() + CHATGLM3 = auto() + INTERNVL_ZH = auto() + MPT = auto() + + +@dataclasses.dataclass +class Conversation: + """A class that manages prompt templates and keeps all conversation history.""" + + # The name of this template + name: str + # The template of the system prompt + system_template: str = '{system_message}' + # The system message + system_message: str = '' + # The names of two roles + roles: Tuple[str] = ('USER', 'ASSISTANT') + # All messages. Each item is (role, message). + messages: List[List[str]] = () + # The number of few shot examples + offset: int = 0 + # The separator style and configurations + sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE + sep: str = '\n' + sep2: str = None + # Stop criteria (the default one is EOS token) + stop_str: Union[str, List[str]] = None + # Stops generation if meeting any token in this list + stop_token_ids: List[int] = None + + def get_prompt(self) -> str: + """Get the prompt for generation.""" + system_prompt = self.system_template.format(system_message=self.system_message) + if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ': ' + message + self.sep + else: + ret += role + ':' + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + seps[0] + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ': ' + message + seps[i % 2] + else: + ret += role + ':' + return ret + elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ': ' + message + self.sep + else: + ret += role + ': ' # must be end with a space + return ret + elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE: + ret = '' if system_prompt == '' else system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + '\n' + message + self.sep + else: + ret += role + '\n' + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + message + self.sep + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.NO_COLON_TWO: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + message + seps[i % 2] + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.RWKV: + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += ( + role + + ': ' + + message.replace('\r\n', '\n').replace('\n\n', '\n') + ) + ret += '\n\n' + else: + ret += role + ':' + return ret + elif self.sep_style == SeparatorStyle.LLAMA2: + seps = [self.sep, self.sep2] + if self.system_message: + ret = system_prompt + else: + ret = '[INST] ' + for i, (role, message) in enumerate(self.messages): + tag = self.roles[i % 2] + if message: + 
if i == 0: + ret += message + ' ' + else: + ret += tag + ' ' + message + seps[i % 2] + else: + ret += tag + return ret + elif self.sep_style == SeparatorStyle.CHATGLM: + # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308 + # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926 + round_add_n = 1 if self.name == 'chatglm2' else 0 + if system_prompt: + ret = system_prompt + self.sep + else: + ret = '' + + for i, (role, message) in enumerate(self.messages): + if i % 2 == 0: + ret += f'[Round {i//2 + round_add_n}]{self.sep}' + + if message: + ret += f'{role}:{message}{self.sep}' + else: + ret += f'{role}:' + return ret + elif self.sep_style == SeparatorStyle.CHATML: + ret = '' if system_prompt == '' else system_prompt + self.sep + '\n' + for role, message in self.messages: + if message: + ret += role + '\n' + message + self.sep + '\n' + else: + ret += role + '\n' + return ret + elif self.sep_style == SeparatorStyle.CHATGLM3: + ret = '' + if self.system_message: + ret += system_prompt + for role, message in self.messages: + if message: + ret += role + '\n' + ' ' + message + else: + ret += role + return ret + elif self.sep_style == SeparatorStyle.CHATINTERN: + # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771 + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + # if i % 2 == 0: + # ret += "" + if message: + ret += role + ':' + message + seps[i % 2] + '\n' + else: + ret += role + ':' + return ret + elif self.sep_style == SeparatorStyle.DOLLY: + seps = [self.sep, self.sep2] + ret = system_prompt + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ':\n' + message + seps[i % 2] + if i % 2 == 1: + ret += '\n\n' + else: + ret += role + ':\n' + return ret + elif self.sep_style == SeparatorStyle.PHOENIX: + ret = system_prompt + for role, message in self.messages: + if message: + ret += role + ': ' + '' + message + '' + else: + ret += role + ': ' + '' + return ret + elif self.sep_style == SeparatorStyle.ROBIN: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ':\n' + message + self.sep + else: + ret += role + ':\n' + return ret + elif self.sep_style == SeparatorStyle.FALCON_CHAT: + ret = '' + if self.system_message: + ret += system_prompt + self.sep + for role, message in self.messages: + if message: + ret += role + ': ' + message + self.sep + else: + ret += role + ':' + + return ret + elif self.sep_style == SeparatorStyle.INTERNVL_ZH: + seps = [self.sep, self.sep2] + ret = self.system_message + seps[0] + for i, (role, message) in enumerate(self.messages): + if message: + ret += role + ': ' + message + seps[i % 2] + else: + ret += role + ':' + return ret + elif self.sep_style == SeparatorStyle.MPT: + ret = system_prompt + self.sep + for role, message in self.messages: + if message: + if type(message) is tuple: + message, _, _ = message + ret += role + message + self.sep + else: + ret += role + return ret + else: + raise ValueError(f'Invalid style: {self.sep_style}') + + def set_system_message(self, system_message: str): + """Set the system message.""" + self.system_message = system_message + + def append_message(self, role: str, message: str): + """Append a new message.""" + self.messages.append([role, message]) + + def 
update_last_message(self, message: str):
+        """Update the last output.
+
+        The last message is typically set to be None when constructing the prompt,
+        so we need to update it in-place after getting the response from a model.
+        """
+        self.messages[-1][1] = message
+
+    def to_gradio_chatbot(self):
+        """Convert the conversation to gradio chatbot format."""
+        ret = []
+        for i, (role, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append([msg, None])
+            else:
+                ret[-1][-1] = msg
+        return ret
+
+    def to_openai_api_messages(self):
+        """Convert the conversation to OpenAI chat completion format."""
+        ret = [{'role': 'system', 'content': self.system_message}]
+
+        for i, (_, msg) in enumerate(self.messages[self.offset :]):
+            if i % 2 == 0:
+                ret.append({'role': 'user', 'content': msg})
+            else:
+                if msg is not None:
+                    ret.append({'role': 'assistant', 'content': msg})
+        return ret
+
+    def copy(self):
+        return Conversation(
+            name=self.name,
+            system_template=self.system_template,
+            system_message=self.system_message,
+            roles=self.roles,
+            messages=[[x, y] for x, y in self.messages],
+            offset=self.offset,
+            sep_style=self.sep_style,
+            sep=self.sep,
+            sep2=self.sep2,
+            stop_str=self.stop_str,
+            stop_token_ids=self.stop_token_ids,
+        )
+
+    def dict(self):
+        return {
+            'template_name': self.name,
+            'system_message': self.system_message,
+            'roles': self.roles,
+            'messages': self.messages,
+            'offset': self.offset,
+        }
+
+
+# A global registry for all conversation templates
+conv_templates: Dict[str, Conversation] = {}
+
+
+def register_conv_template(template: Conversation, override: bool = False):
+    """Register a new conversation template."""
+    if not override:
+        assert (
+            template.name not in conv_templates
+        ), f'{template.name} has been registered.'
+
+    conv_templates[template.name] = template
+
+
+def get_conv_template(name: str) -> Conversation:
+    """Get a conversation template."""
+    return conv_templates[name].copy()
+
+
+# Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
+# is that during training, the preprocessing function for the Hermes-2 template doesn't add
+# <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
+# Therefore, they are completely equivalent during inference.
+register_conv_template(
+    Conversation(
+        name='Hermes-2',
+        system_template='<|im_start|>system\n{system_message}',
+        # note: The new system prompt was not used here to avoid changes in benchmark performance.
+        # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。',
+        system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。',
+        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
+        sep_style=SeparatorStyle.MPT,
+        sep='<|im_end|>',
+        stop_token_ids=[
+            2,
+            6,
+            7,
+            8,
+        ],
+        stop_str='<|endoftext|>',
+    )
+)
+
+
+register_conv_template(
+    Conversation(
+        name='internlm2-chat',
+        system_template='<|im_start|>system\n{system_message}',
+        # note: The new system prompt was not used here to avoid changes in benchmark performance.
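+        # For reference, with SeparatorStyle.MPT a single-turn prompt from get_prompt() renders
+        # roughly as (the user question below is only an illustration):
+        #   <|im_start|>system\n{system_message}<|im_end|><|im_start|>user\n{question}<|im_end|><|im_start|>assistant\n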
+ # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。', + system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。', + roles=('<|im_start|>user\n', '<|im_start|>assistant\n'), + sep_style=SeparatorStyle.MPT, + sep='<|im_end|>', + stop_token_ids=[ + 2, + 92543, + 92542 + ] + ) +) + + +register_conv_template( + Conversation( + name='phi3-chat', + system_template='<|system|>\n{system_message}', + # note: The new system prompt was not used here to avoid changes in benchmark performance. + # system_message='我是书生·万象,英文名是InternVL,是由上海人工智能实验室及多家合作单位联合开发的多模态大语言模型。', + system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。', + roles=('<|user|>\n', '<|assistant|>\n'), + sep_style=SeparatorStyle.MPT, + sep='<|end|>', + stop_token_ids=[ + 2, + 32000, + 32007 + ] + ) +) diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/generation_config.json b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..730945539a03baf7ab5ba19260491a02b9393cb3 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/generation_config.json @@ -0,0 +1,4 @@ +{ + "_from_model_config": true, + "transformers_version": "4.39.0" +} diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/model.safetensors b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9e0effcc9c402e2568be312fa97ab83eb88b6003 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0f63222f6230b490e7a2f9fbc3ed83cd8d224212fe0ee62e6cf8dcab72eb49 +size 4411571040 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_intern_vit.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_intern_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..e32d44616dc9b9a0a83f7f55d30e708a415a9584 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_intern_vit.py @@ -0,0 +1,435 @@ +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +from typing import Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from einops import rearrange +from timm.models.layers import DropPath +from torch import nn +from transformers.activations import ACT2FN +from transformers.modeling_outputs import (BaseModelOutput, + BaseModelOutputWithPooling) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import logging + +from .configuration_intern_vit import InternVisionConfig + +try: + try: # v1 + from flash_attn.flash_attn_interface import \ + flash_attn_unpadded_qkvpacked_func + except: # v2 + from flash_attn.flash_attn_interface import \ + flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func + + from flash_attn.bert_padding import pad_input, unpad_input + + has_flash_attn = True +except: + print('FlashAttention is not installed.') + has_flash_attn = False + +logger = logging.get_logger(__name__) + + +class FlashAttention(nn.Module): + """Implement the scaled dot product 
attention with softmax. + Arguments + --------- + softmax_scale: The temperature to use for the softmax attention. + (default: 1/sqrt(d_keys) where d_keys is computed at + runtime) + attention_dropout: The dropout rate to apply to the attention + (default: 0.0) + """ + + def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None): + super().__init__() + self.softmax_scale = softmax_scale + self.dropout_p = attention_dropout + + def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None, + max_s=None, need_weights=False): + """Implements the multihead softmax attention. + Arguments + --------- + qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None + if unpadded: (nnz, 3, h, d) + key_padding_mask: a bool tensor of shape (B, S) + """ + assert not need_weights + assert qkv.dtype in [torch.float16, torch.bfloat16] + assert qkv.is_cuda + + if cu_seqlens is None: + batch_size = qkv.shape[0] + seqlen = qkv.shape[1] + if key_padding_mask is None: + qkv = rearrange(qkv, 'b s ... -> (b s) ...') + max_s = seqlen + cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32, + device=qkv.device) + output = flash_attn_unpadded_qkvpacked_func( + qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, + softmax_scale=self.softmax_scale, causal=causal + ) + output = rearrange(output, '(b s) ... -> b s ...', b=batch_size) + else: + nheads = qkv.shape[-2] + x = rearrange(qkv, 'b s three h d -> b s (three h d)') + x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask) + x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads) + output_unpad = flash_attn_unpadded_qkvpacked_func( + x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, + softmax_scale=self.softmax_scale, causal=causal + ) + output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), + indices, batch_size, seqlen), + 'b s (h d) -> b s h d', h=nheads) + else: + assert max_s is not None + output = flash_attn_unpadded_qkvpacked_func( + qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0, + softmax_scale=self.softmax_scale, causal=causal + ) + + return output, None + + +class InternRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +try: + from apex.normalization import FusedRMSNorm + + InternRMSNorm = FusedRMSNorm # noqa + + logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm') +except ImportError: + # using the normal InternRMSNorm + pass +except Exception: + logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm') + pass + + +NORM2FN = { + 'rms_norm': InternRMSNorm, + 'layer_norm': nn.LayerNorm, +} + + +class InternVisionEmbeddings(nn.Module): + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter( + torch.randn(1, 1, self.embed_dim), + ) + + 
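+        # The Conv2d below splits the image into non-overlapping patch_size x patch_size patches
+        # and projects each patch to an embed_dim-dimensional token (e.g. a 448px input with
+        # 14px patches would give 32 x 32 = 1024 patch tokens, plus the class token above).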
self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def _get_pos_embed(self, pos_embed, H, W): + target_dtype = pos_embed.dtype + pos_embed = pos_embed.float().reshape( + 1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2) + pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False). \ + reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype) + return pos_embed + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, channel, width, height] + batch_size, _, height, width = patch_embeds.shape + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + position_embedding = torch.cat([ + self.position_embedding[:, :1, :], + self._get_pos_embed(self.position_embedding[:, 1:, :], height, width) + ], dim=1) + embeddings = embeddings + position_embedding.to(target_dtype) + return embeddings + + +class InternAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.use_flash_attn = config.use_flash_attn and has_flash_attn + if config.use_flash_attn and not has_flash_attn: + print('Warning: Flash Attention is not available, use_flash_attn is set to False.') + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:' + f' {self.num_heads}).' 
+ ) + + self.scale = self.head_dim ** -0.5 + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias) + self.attn_drop = nn.Dropout(config.attention_dropout) + self.proj_drop = nn.Dropout(config.dropout) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + if self.use_flash_attn: + self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout) + self.proj = nn.Linear(self.embed_dim, self.embed_dim) + + def _naive_attn(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + if self.qk_normalization: + B_, H_, N_, D_ = q.shape + q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2) + k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2) + + attn = ((q * self.scale) @ k.transpose(-2, -1)) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def _flash_attn(self, x, key_padding_mask=None, need_weights=False): + qkv = self.qkv(x) + qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads) + + if self.qk_normalization: + q, k, v = qkv.unbind(2) + q = self.q_norm(q.flatten(-2, -1)).view(q.shape) + k = self.k_norm(k.flatten(-2, -1)).view(k.shape) + qkv = torch.stack([q, k, v], dim=2) + + context, _ = self.inner_attn( + qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False + ) + outs = self.proj(rearrange(context, 'b s h d -> b s (h d)')) + outs = self.proj_drop(outs) + return outs + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states) + return x + + +class InternMLP(nn.Module): + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + self.act = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class InternVisionEncoderLayer(nn.Module): + def __init__(self, config: InternVisionConfig, drop_path_rate: float): + super().__init__() + self.embed_dim = config.hidden_size + self.intermediate_size = config.intermediate_size + self.norm_type = config.norm_type + + self.attn = InternAttention(config) + self.mlp = InternMLP(config) + self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) + self.norm2 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) + + self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim)) + self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim)) + self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() + self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + + def forward( + self, + hidden_states: torch.Tensor, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]: + """ + Args: + hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)` + """ + hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1) + + hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2) + + return hidden_states + + +class InternVisionEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`InternEncoderLayer`]. + + Args: + config (`InternConfig`): + The corresponding vision configuration for the `InternEncoder`. + """ + + def __init__(self, config: InternVisionConfig): + super().__init__() + self.config = config + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)] + self.layers = nn.ModuleList([ + InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)]) + self.gradient_checkpointing = True + + def forward( + self, + inputs_embeds, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + hidden_states = inputs_embeds + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = torch.utils.checkpoint.checkpoint( + encoder_layer, + hidden_states) + else: + layer_outputs = encoder_layer( + hidden_states, + ) + hidden_states = layer_outputs + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states + ) + + +class InternVisionModel(PreTrainedModel): + main_input_name = 'pixel_values' + _supports_flash_attn_2 = True + config_class = InternVisionConfig + _no_split_modules = ['InternVisionEncoderLayer'] + + def __init__(self, config: InternVisionConfig): + super().__init__(config) + self.config = config + + self.embeddings = InternVisionEmbeddings(config) + self.encoder = InternVisionEncoder(config) + + def resize_pos_embeddings(self, old_size, new_size, patch_size): + pos_emb = self.embeddings.position_embedding + _, num_positions, embed_dim = pos_emb.shape + cls_emb = pos_emb[:, :1, :] + pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2) + pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False) + pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1) + pos_emb = torch.cat([cls_emb, pos_emb], dim=1) + self.embeddings.position_embedding = nn.Parameter(pos_emb) + self.embeddings.image_size = new_size + logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size)) + + def get_input_embeddings(self): + return self.embeddings + + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_embeds: Optional[torch.FloatTensor] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None and pixel_embeds is None: + raise ValueError('You have to specify pixel_values or pixel_embeds') + + if pixel_embeds is not None: + hidden_states = pixel_embeds + else: + if len(pixel_values.shape) == 4: + hidden_states = self.embeddings(pixel_values) + else: + raise ValueError(f'wrong pixel_values size: {pixel_values.shape}') + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = encoder_outputs.last_hidden_state + pooled_output = last_hidden_state[:, 0, :] + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) diff --git 
a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_internlm2.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_internlm2.py new file mode 100644 index 0000000000000000000000000000000000000000..7c8c24d873f6ecd152d00fd65371e23ead981e1d --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_internlm2.py @@ -0,0 +1,1415 @@ +# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on transformers/src/transformers/models/llama/modeling_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch InternLM2 model.""" +import math +import queue +import threading +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from einops import rearrange +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss +from transformers.activations import ACT2FN +from transformers.modeling_outputs import (BaseModelOutputWithPast, + CausalLMOutputWithPast, + SequenceClassifierOutputWithPast) +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import (add_start_docstrings, + add_start_docstrings_to_model_forward, logging, + replace_return_docstrings) + +try: + from transformers.generation.streamers import BaseStreamer +except: # noqa # pylint: disable=bare-except + BaseStreamer = None + +from .configuration_internlm2 import InternLM2Config + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = 'InternLM2Config' + +flash_attn_func, flash_attn_varlen_func = None, None +pad_input, index_first_axis, unpad_input = None, None, None +try: + from flash_attn import flash_attn_func as _flash_attn_func + from flash_attn import flash_attn_varlen_func as _flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis as _index_first_axis + from flash_attn.bert_padding import pad_input as _pad_input + from flash_attn.bert_padding import unpad_input as _unpad_input + + flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func + pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input + has_flash_attn = True +except: + has_flash_attn = False + + +def _import_flash_attn(): + global flash_attn_func, flash_attn_varlen_func + global pad_input, index_first_axis, unpad_input + try: + from flash_attn import flash_attn_func as _flash_attn_func + from flash_attn import \ + flash_attn_varlen_func as _flash_attn_varlen_func + from flash_attn.bert_padding import \ + index_first_axis as _index_first_axis + from flash_attn.bert_padding import pad_input as _pad_input + from flash_attn.bert_padding import unpad_input as _unpad_input + flash_attn_func, flash_attn_varlen_func = _flash_attn_func, _flash_attn_varlen_func + pad_input, index_first_axis, unpad_input = _pad_input, _index_first_axis, _unpad_input + except ImportError: + raise ImportError('flash_attn is not 
installed.') + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min, device=device), device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->InternLM2 +class InternLM2RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + InternLM2RMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + +# Copied from transformers.model.llama.modeling_llama.LlamaRotaryEmbedding with Llama->InternLM2 +class InternLM2RotaryEmbedding(nn.Module): + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer('inv_freq', inv_freq, persistent=False) + + # Build here to make `torch.jit.trace` work. 
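+        # Pre-compute cos/sin lookup tables for every position up to max_position_embeddings;
+        # forward() only rebuilds them (via _set_cos_sin_cache) when a longer sequence is seen.
+        # After cat((freqs, freqs), dim=-1) the cached tables have shape (seq_len, dim).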
+ self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype) + + freqs = torch.einsum('i,j->ij', t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False) + self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=torch.float32) + + return ( + self.cos_cached[:seq_len].to(dtype=x.dtype), + self.sin_cached[:seq_len].to(dtype=x.dtype), + ) + + +# Copied from transformers.model.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype) + t = t / self.scaling_factor + + freqs = torch.einsum('i,j->ij', t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False) + self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->InternLM2 +class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding): + """InternLM2RotaryEmbedding extended with Dynamic NTK scaling. + Credits to the Reddit users /u/bloc97 and /u/emozilla. 
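+
+    When the requested sequence length exceeds ``max_position_embeddings``, ``_set_cos_sin_cache``
+    rescales ``base`` (NTK-aware interpolation) before rebuilding the cos/sin tables, so the same
+    pretrained weights can roughly accommodate longer contexts.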
+ """ + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0): + self.scaling_factor = scaling_factor + super().__init__(dim, max_position_embeddings, base, device) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + + if seq_len > self.max_position_embeddings: + base = self.base * ( + (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1) + ) ** (self.dim / (self.dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer('inv_freq', inv_freq, persistent=False) + + t = torch.arange(self.max_seq_len_cached, device=device).to(dtype=self.inv_freq.dtype) + + freqs = torch.einsum('i,j->ij', t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer('cos_cached', emb.cos().to(dtype), persistent=False) + self.register_buffer('sin_cached', emb.sin().to(dtype), persistent=False) + + +# Copied from transformers.model.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.model.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors.""" + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +class InternLM2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x)) + + return down_proj + + +# Copied from transformers.model.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). 
The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +# Modified from transformers.model.llama.modeling_llama.LlamaAttention +class InternLM2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: InternLM2Config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.is_causal = True + + if (self.head_dim * self.num_heads) != self.hidden_size: + raise ValueError( + f'hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}' + f' and `num_heads`: {self.num_heads}).' + ) + + self.wqkv = nn.Linear( + self.hidden_size, + (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim, + bias=config.bias, + ) + + self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias) + self._init_rope() + + def _init_rope(self): + if self.config.rope_scaling is None: + self.rotary_emb = InternLM2RotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + ) + else: + scaling_type = self.config.rope_scaling['type'] + scaling_factor = self.config.rope_scaling['factor'] + if scaling_type == 'dynamic': + self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + elif scaling_type == 'linear': + self.rotary_emb = InternLM2LinearScalingRotaryEmbedding( + self.head_dim, + max_position_embeddings=self.max_position_embeddings, + base=self.config.rope_theta, + scaling_factor=scaling_factor, + ) + else: + raise ValueError("Currently we only support rotary embedding's type being 'dynamic' or 'linear'.") + return self.rotary_emb + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if 'padding_mask' in kwargs: + warnings.warn( + 'Passing `padding_mask` is deprecated and will be removed in v4.37. 
' + 'Please make sure use `attention_mask` instead.`' + ) + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + 'b q (h gs d) -> b q h gs d', + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f'Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is' + f' {attn_weights.size()}' + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f'Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}' + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f'`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is' + f' {attn_output.size()}' + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +# Modified from transformers.model.llama.modeling_llama.InternLM2FlashAttention2 +class InternLM2FlashAttention2(InternLM2Attention): + """ + InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. 
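+
+    Concretely: if an ``attention_mask`` containing padding is supplied, the inputs are unpadded
+    and routed through the varlen flash-attention kernel, then re-padded; otherwise the dense
+    ``flash_attn_func`` path is used (see ``_flash_attention_forward`` below).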
+ """ + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # InternLM2FlashAttention2 attention does not support output_attentions + if 'padding_mask' in kwargs: + warnings.warn( + 'Passing `padding_mask` is deprecated and will be removed in v4.37. ' + 'Please make sure use `attention_mask` instead.`' + ) + + # overwrite attention_mask with padding_mask + attention_mask = kwargs.pop('padding_mask') + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + qkv_states = self.wqkv(hidden_states) + + qkv_states = rearrange( + qkv_states, + 'b q (h gs d) -> b q h gs d', + gs=2 + self.num_key_value_groups, + d=self.head_dim, + ) + + query_states = qkv_states[..., : self.num_key_value_groups, :] + query_states = rearrange(query_states, 'b q h gs d -> b q (h gs) d') + key_states = qkv_states[..., -2, :] + value_states = qkv_states[..., -1, :] + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len + ) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous() + attn_output = self.wo(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`int`, *optional*): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. 
Default to 1 / sqrt(head_dim) + """ + # Contains at least one padding token in the sequence + causal = self.is_causal and query_length != 1 + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._unpad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + def _unpad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
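+            # Keep only the mask entries that correspond to the query tokens, then let unpad_input
+            # flatten the remaining tokens and return the cu_seqlens offsets expected by the
+            # varlen flash-attention kernel.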
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q.to(torch.int64), + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +INTERNLM2_ATTENTION_CLASSES = { + 'eager': InternLM2Attention, + 'flash_attention_2': InternLM2FlashAttention2, +} + + +# Modified from transformers.model.llama.modeling_llama.LlamaDecoderLayer +class InternLM2DecoderLayer(nn.Module): + def __init__(self, config: InternLM2Config): + super().__init__() + self.hidden_size = config.hidden_size + + self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config) + + self.feed_forward = InternLM2MLP(config) + self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + if 'padding_mask' in kwargs: + warnings.warn( + 'Passing `padding_mask` is deprecated and will be removed in v4.37. ' + 'Please make sure use `attention_mask` instead.`' + ) + + residual = hidden_states + + hidden_states = self.attention_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.ffn_norm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +InternLM2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. 
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`InternLM2Config`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2 +@add_start_docstrings( + 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', + InternLM2_START_DOCSTRING, +) +class InternLM2PreTrainedModel(PreTrainedModel): + config_class = InternLM2Config + base_model_prefix = 'model' + supports_gradient_checkpointing = True + _no_split_modules = ['InternLM2DecoderLayer'] + _skip_keys_device_placement = 'past_key_values' + _supports_flash_attn_2 = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +InternLM2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or + when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`. 
+ + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Modified from transformers.model.llama.modeling_llama.LlamaModel +@add_start_docstrings( + 'The bare InternLM2 Model outputting raw hidden-states without any specific head on top.', + InternLM2_START_DOCSTRING, +) +class InternLM2Model(InternLM2PreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. 
Each layer is a [`InternLM2DecoderLayer`] + + Args: + config: InternLM2Config + """ + + _auto_class = 'AutoModel' + + def __init__(self, config: InternLM2Config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.config = config + if not has_flash_attn: + self.config.attn_implementation = 'eager' + print('Warning: Flash attention is not available, using eager attention instead.') + + self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + + self.layers = nn.ModuleList([InternLM2DecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.tok_embeddings + + def set_input_embeddings(self, value): + self.tok_embeddings = value + + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.attn_implementation == 'flash_attention_2': + _import_flash_attn() + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time') + elif input_ids is not None: + batch_size, seq_length = input_ids.shape[:2] + elif inputs_embeds is not None: + batch_size, seq_length = inputs_embeds.shape[:2] + else: + raise ValueError('You have to specify either input_ids or inputs_embeds') + + seq_length_with_past = seq_length + past_key_values_length = 0 + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past = seq_length_with_past + past_key_values_length + + if position_ids is None: + device = 
input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0) + + if inputs_embeds is None: + inputs_embeds = self.tok_embeddings(input_ids) + + if self.config.attn_implementation == 'flash_attention_2': + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + else: + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + # embed positions + hidden_states = inputs_embeds + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + '`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...' + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + + def create_custom_forward(module): + def custom_forward(*inputs): + # None for past_key_value + return module(*inputs, output_attentions, None) + + return custom_forward + + layer_outputs = torch.utils.checkpoint.checkpoint( + create_custom_forward(decoder_layer), + hidden_states, + attention_mask, + position_ids, + None, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + +# Modified from transformers.model.llama.modeling_llama.LlamaForCausalLM +class InternLM2ForCausalLM(InternLM2PreTrainedModel): + _auto_class = 'AutoModelForCausalLM' + + _tied_weights_keys = ['output.weight'] + + def __init__(self, config): + super().__init__(config) + self.model = InternLM2Model(config) + self.vocab_size = config.vocab_size + self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + def get_output_embeddings(self): + return self.output + + def set_output_embeddings(self, new_embeddings): + 
self.output = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, InternLM2ForCausalLM + + >>> model = InternLM2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS) + >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER) + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.output(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + device = input_ids.device if input_ids is not None else inputs_embeds.device + output = CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + output['logits'] = output['logits'].to(device) + return output + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + position_ids = kwargs.get('position_ids', None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {'inputs_embeds': inputs_embeds} + else: + model_inputs = {'input_ids': input_ids} + + model_inputs.update( + { + 'position_ids': position_ids, + 'past_key_values': past_key_values, + 'use_cache': kwargs.get('use_cache'), + 'attention_mask': attention_mask, + } + ) + return model_inputs + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=''): + if tokenizer.add_bos_token: + prompt = '' + else: + prompt = tokenizer.bos_token + if meta_instruction: + prompt += 
f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n""" + for record in history: + prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n""" + prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n""" + return tokenizer([prompt], return_tensors='pt') + + @torch.no_grad() + def chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + streamer: Optional[BaseStreamer] = None, + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + meta_instruction: str = 'You are an AI assistant whose name is InternLM (书生·浦语).\n' + '- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n' + '- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.', + **kwargs, + ): + inputs = self.build_inputs(tokenizer, query, history, meta_instruction) + inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)} + # also add end-of-assistant token in eos token id to avoid unnecessary generation + eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(['<|im_end|>'])[0]] + outputs = self.generate( + **inputs, + streamer=streamer, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + eos_token_id=eos_token_id, + **kwargs, + ) + outputs = outputs[0].cpu().tolist()[len(inputs['input_ids'][0]) :] + response = tokenizer.decode(outputs, skip_special_tokens=True) + response = response.split('<|im_end|>')[0] + history = history + [(query, response)] + return response, history + + @torch.no_grad() + def stream_chat( + self, + tokenizer, + query: str, + history: List[Tuple[str, str]] = [], + max_new_tokens: int = 1024, + do_sample: bool = True, + temperature: float = 0.8, + top_p: float = 0.8, + **kwargs, + ): + """ + Return a generator in format: (response, history) + Eg. + ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')]) + ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')]) + """ + if BaseStreamer is None: + raise ModuleNotFoundError( + 'The version of `transformers` is too low. Please make sure ' + 'that you have installed `transformers>=4.28.0`.' 
+ ) + + response_queue = queue.Queue(maxsize=20) + + class ChatStreamer(BaseStreamer): + def __init__(self, tokenizer) -> None: + super().__init__() + self.tokenizer = tokenizer + self.queue = response_queue + self.query = query + self.history = history + self.response = '' + self.cache = [] + self.received_inputs = False + self.queue.put((self.response, history + [(self.query, self.response)])) + + def put(self, value): + if len(value.shape) > 1 and value.shape[0] > 1: + raise ValueError('ChatStreamer only supports batch size 1') + elif len(value.shape) > 1: + value = value[0] + + if not self.received_inputs: + # The first received value is input_ids, ignore here + self.received_inputs = True + return + + self.cache.extend(value.tolist()) + token = self.tokenizer.decode(self.cache, skip_special_tokens=True) + if token.strip() != '<|im_end|>': + self.response = self.response + token + history = self.history + [(self.query, self.response)] + self.queue.put((self.response, history)) + self.cache = [] + else: + self.end() + + def end(self): + self.queue.put(None) + + def stream_producer(): + return self.chat( + tokenizer=tokenizer, + query=query, + streamer=ChatStreamer(tokenizer=tokenizer), + history=history, + max_new_tokens=max_new_tokens, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + **kwargs, + ) + + def consumer(): + producer = threading.Thread(target=stream_producer) + producer.start() + while True: + res = response_queue.get() + if res is None: + return + yield res + + return consumer() + + +# Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2 +@add_start_docstrings( + """ + The InternLM2 Model transformer with a sequence classification head on top (linear layer). + + [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, + as other causal models (e.g. GPT-2) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). 
+ """, + InternLM2_START_DOCSTRING, +) +class InternLM2ForSequenceClassification(InternLM2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.model = InternLM2Model(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.tok_embeddings + + def set_input_embeddings(self, value): + self.model.tok_embeddings = value + + @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.model( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError('Cannot handle batch sizes > 1 if no padding token is defined.') + if self.config.pad_token_id is None: + sequence_lengths = -1 + else: + if input_ids is not None: + sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to( + logits.device + ) + else: + sequence_lengths = -1 + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths] + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = 'regression' + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = 'single_label_classification' + else: + self.config.problem_type = 'multi_label_classification' + + if self.config.problem_type == 'regression': + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == 'single_label_classification': + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == 'multi_label_classification': + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = 
(pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_internvl_chat.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_internvl_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..3d4f4b03b53b8399e9194a4e436e9ea40b28cdea --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/modeling_internvl_chat.py @@ -0,0 +1,345 @@ +# -------------------------------------------------------- +# InternVL +# Copyright (c) 2024 OpenGVLab +# Licensed under The MIT License [see LICENSE for details] +# -------------------------------------------------------- +import warnings +from typing import Any, List, Optional, Tuple, Union + +import torch.utils.checkpoint +import transformers +from torch import nn +from torch.nn import CrossEntropyLoss +from transformers import (AutoModel, GenerationConfig, LlamaForCausalLM, + LlamaTokenizer) +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ModelOutput, logging + +from .configuration_internvl_chat import InternVLChatConfig +from .conversation import get_conv_template +from .modeling_intern_vit import InternVisionModel +from .modeling_internlm2 import InternLM2ForCausalLM + +logger = logging.get_logger(__name__) + + +def version_cmp(v1, v2, op='eq'): + import operator + + from packaging import version + op_func = getattr(operator, op) + return op_func(version.parse(v1), version.parse(v2)) + + +class InternVLChatModel(PreTrainedModel): + config_class = InternVLChatConfig + main_input_name = 'pixel_values' + _supports_flash_attn_2 = True + _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer'] + + def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None): + super().__init__(config) + + assert version_cmp(transformers.__version__, '4.36.2', 'ge') + image_size = config.force_image_size or config.vision_config.image_size + patch_size = config.vision_config.patch_size + self.patch_size = patch_size + self.select_layer = config.select_layer + self.template = config.template + self.num_image_token = int((image_size // patch_size) ** 2 * (config.downsample_ratio ** 2)) + self.downsample_ratio = config.downsample_ratio + self.ps_version = config.ps_version + + logger.info(f'num_image_token: {self.num_image_token}') + logger.info(f'ps_version: {self.ps_version}') + if vision_model is not None: + self.vision_model = vision_model + else: + self.vision_model = InternVisionModel(config.vision_config) + if language_model is not None: + self.language_model = language_model + else: + if config.llm_config.architectures[0] == 'LlamaForCausalLM': + self.language_model = LlamaForCausalLM(config.llm_config) + elif config.llm_config.architectures[0] == 'InternLM2ForCausalLM': + self.language_model = InternLM2ForCausalLM(config.llm_config) + else: + raise NotImplementedError(f'{config.llm_config.architectures[0]} is not implemented.') + + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.llm_config.hidden_size + + self.mlp1 = nn.Sequential( + 
nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio) ** 2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio) ** 2, llm_hidden_size), + nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size) + ) + + self.img_context_token_id = None + self.conv_template = get_conv_template(self.template) + self.system_message = self.conv_template.system_message + + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + image_flags: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + image_flags = image_flags.squeeze(-1) + input_embeds = self.language_model.get_input_embeddings()(input_ids) + + vit_embeds = self.extract_feature(pixel_values) + vit_embeds = vit_embeds[image_flags == 1] + vit_batch_size = pixel_values.shape[0] + + B, N, C = input_embeds.shape + input_embeds = input_embeds.reshape(B * N, C) + + if torch.distributed.get_rank() == 0: + print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}') + + input_ids = input_ids.reshape(B * N) + selected = (input_ids == self.img_context_token_id) + try: + input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds.reshape(-1, C) + except Exception as e: + vit_embeds = vit_embeds.reshape(-1, C) + print(f'warning: {e}, input_embeds[selected].shape={input_embeds[selected].shape}, ' + f'vit_embeds.shape={vit_embeds.shape}') + n_token = selected.sum() + input_embeds[selected] = input_embeds[selected] * 0.0 + vit_embeds[:n_token] + + input_embeds = input_embeds.reshape(B, N, C) + + outputs = self.language_model( + inputs_embeds=input_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.language_model.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def pixel_shuffle(self, x, scale_factor=0.5): + n, w, h, c = x.size() + # N, W, H, C --> N, W, H * scale, C // scale + x = x.view(n, w, int(h * scale_factor), int(c / scale_factor)) + # N, W, H * scale, C // scale --> N, H * scale, W, C // scale + x = x.permute(0, 2, 1, 3).contiguous() + # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2) + x = x.view(n, int(h * 
scale_factor), int(w * scale_factor),
+                   int(c / (scale_factor * scale_factor)))
+        if self.ps_version == 'v1':
+            warnings.warn("In ps_version 'v1', the height and width have not been swapped back, "
+                          'which results in a transposed image.')
+        else:
+            x = x.permute(0, 2, 1, 3).contiguous()
+        return x
+
+    def extract_feature(self, pixel_values):
+        if self.select_layer == -1:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values,
+                output_hidden_states=False,
+                return_dict=True).last_hidden_state
+        else:
+            vit_embeds = self.vision_model(
+                pixel_values=pixel_values,
+                output_hidden_states=True,
+                return_dict=True).hidden_states[self.select_layer]
+        vit_embeds = vit_embeds[:, 1:, :]
+
+        h = w = int(vit_embeds.shape[1] ** 0.5)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+        vit_embeds = self.pixel_shuffle(vit_embeds, scale_factor=self.downsample_ratio)
+        vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+        vit_embeds = self.mlp1(vit_embeds)
+        return vit_embeds
+
+    def batch_chat(self, tokenizer, pixel_values, questions, generation_config, num_patches_list=None,
+                   history=None, return_history=False, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
+                   IMG_CONTEXT_TOKEN='<IMG_CONTEXT>', verbose=False, image_counts=None):
+        if history is not None or return_history:
+            print('Now multi-turn chat is not supported in batch_chat.')
+            raise NotImplementedError
+
+        if image_counts is not None:
+            num_patches_list = image_counts
+            print('Warning: `image_counts` is deprecated. Please use `num_patches_list` instead.')
+
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
+
+        queries = []
+        for idx, num_patches in enumerate(num_patches_list):
+            question = questions[idx]
+            if pixel_values is not None and '<image>' not in question:
+                question = '<image>\n' + question
+            template = get_conv_template(self.template)
+            template.append_message(template.roles[0], question)
+            template.append_message(template.roles[1], None)
+            query = template.get_prompt()
+
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
+            queries.append(query)
+
+        tokenizer.padding_side = 'left'
+        model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
+        input_ids = model_inputs['input_ids'].cuda()
+        attention_mask = model_inputs['attention_mask'].cuda()
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
+        generation_config['eos_token_id'] = eos_token_id
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
+        responses = [response.split(template.sep)[0].strip() for response in responses]
+        return responses
+
+    def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
+             num_patches_list=None, IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>', IMG_CONTEXT_TOKEN='<IMG_CONTEXT>',
+             verbose=False):
+
+        if history is None and pixel_values is not None and '<image>' not in question:
+            question = '<image>\n' + question
+
+        if num_patches_list is None:
+            num_patches_list = [pixel_values.shape[0]] if pixel_values is not None else []
+        assert pixel_values is None or len(pixel_values) == sum(num_patches_list)
+
+        img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+        self.img_context_token_id = img_context_token_id
+
+        template = get_conv_template(self.template)
+        template.system_message = self.system_message
+        eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
+
+        history = [] if history is None else history
+        for (old_question, old_answer) in history:
+            template.append_message(template.roles[0], old_question)
+            template.append_message(template.roles[1], old_answer)
+        template.append_message(template.roles[0], question)
+        template.append_message(template.roles[1], None)
+        query = template.get_prompt()
+
+        if verbose and pixel_values is not None:
+            image_bs = pixel_values.shape[0]
+            print(f'dynamic ViT batch size: {image_bs}')
+
+        for num_patches in num_patches_list:
+            image_tokens = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * self.num_image_token * num_patches + IMG_END_TOKEN
+            query = query.replace('<image>', image_tokens, 1)
+
+        model_inputs = tokenizer(query, return_tensors='pt')
+        input_ids = model_inputs['input_ids'].cuda()
+        attention_mask = model_inputs['attention_mask'].cuda()
+        generation_config['eos_token_id'] = eos_token_id
+        generation_output = self.generate(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            **generation_config
+        )
+        response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
+        response = response.split(template.sep)[0].strip()
+        history.append((question, response))
+        if return_history:
+            return response, history
+        else:
+            query_to_print = query.replace(IMG_CONTEXT_TOKEN, '')
+            query_to_print = query_to_print.replace(f'{IMG_START_TOKEN}{IMG_END_TOKEN}', '<image>')
+            if verbose:
+                print(query_to_print, response)
+            return response
+
+    @torch.no_grad()
+    def generate(
+            self,
+            pixel_values: Optional[torch.FloatTensor] = None,
+            input_ids: Optional[torch.FloatTensor] = None,
+            attention_mask: Optional[torch.LongTensor] = None,
+            visual_features: Optional[torch.FloatTensor] = None,
+            generation_config: Optional[GenerationConfig] = None,
+            output_hidden_states: Optional[bool] = None,
+            return_dict: Optional[bool] = None,
+            **generate_kwargs,
+    ) -> torch.LongTensor:
+
+        assert self.img_context_token_id is not None
+        if pixel_values is not None:
+            if visual_features is not None:
+                vit_embeds = visual_features
+            else:
+                vit_embeds = self.extract_feature(pixel_values)
+            input_embeds = self.language_model.get_input_embeddings()(input_ids)
+            B, N, C = input_embeds.shape
+            input_embeds = input_embeds.reshape(B * N, C)
+
+            input_ids = input_ids.reshape(B * N)
+            selected = (input_ids == self.img_context_token_id)
+            assert selected.sum() != 0
+            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
+
+            input_embeds = input_embeds.reshape(B, N, C)
+        else:
+            input_embeds = self.language_model.get_input_embeddings()(input_ids)
+
+        outputs = self.language_model.generate(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            generation_config=generation_config,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            use_cache=True,
+            **generate_kwargs,
+        )
+
+        return outputs
diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/special_tokens_map.json b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/special_tokens_map.json
new file mode 100644
index 0000000000000000000000000000000000000000..cbf34a50d27c43ed8d1e2823b800b4e6f66e637a
--- /dev/null
+++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/special_tokens_map.json
@@ -0,0 +1,47 @@
+{ + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|action_start|>", + "<|action_end|>", + "<|interpreter|>", + "<|plugin|>", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenization_internlm2.py b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenization_internlm2.py new file mode 100644 index 0000000000000000000000000000000000000000..1be581da37ef678de65f2737493fc0ed7160446e --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenization_internlm2.py @@ -0,0 +1,235 @@ +# Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on transformers/src/transformers/models/llama/tokenization_llama.py +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes for InternLM.""" +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import logging + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'} + +PRETRAINED_VOCAB_FILES_MAP = {} + + +# Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer +class InternLM2Tokenizer(PreTrainedTokenizer): + """ + Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding. + + Args: + vocab_file (`str`): + Path to the vocabulary file. 
+ """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + model_input_names = ['input_ids', 'attention_mask'] + _auto_class = 'AutoTokenizer' + + def __init__( + self, + vocab_file, + unk_token='', + bos_token='', + eos_token='', + pad_token='', + sp_model_kwargs: Optional[Dict[str, Any]] = None, + add_bos_token=True, + add_eos_token=False, + decode_with_prefix_space=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.vocab_file = vocab_file + self.add_bos_token = add_bos_token + self.add_eos_token = add_eos_token + self.decode_with_prefix_space = decode_with_prefix_space + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + self._no_prefix_space_tokens = None + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + @property + def no_prefix_space_tokens(self): + if self._no_prefix_space_tokens is None: + vocab = self.convert_ids_to_tokens(list(range(self.vocab_size))) + self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith('▁')} + return self._no_prefix_space_tokens + + @property + def vocab_size(self): + """Returns vocab size""" + return self.sp_model.get_piece_size() + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def _maybe_add_prefix_space(self, tokens, decoded): + if tokens and tokens[0] not in self.no_prefix_space_tokens: + return ' ' + decoded + else: + return decoded + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = '' + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += ' ' + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + out_string = self.clean_up_tokenization(out_string) + out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string) + return out_string[1:] + + def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + logger.error(f'Vocabulary path ({save_directory}) should be a directory') + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file'] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, 'wb') as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. 
+ """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenizer.model b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6600712949ca9c4ffb50f25275993a21fba0b408 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b +size 1477754 diff --git a/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenizer_config.json b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1f32946df0f56d92ddbc1df79cabb4477b622480 --- /dev/null +++ b/work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10/tokenizer_config.json @@ -0,0 +1,179 @@ +{ + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92538": { + "content": "<|plugin|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92539": { + "content": "<|interpreter|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92540": { + "content": "<|action_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92541": { + "content": "<|action_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92542": { + "content": "<|im_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92543": { + "content": "<|im_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92544": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92545": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92546": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92547": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92548": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92549": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92550": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "92551": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "92552": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "additional_special_tokens": [ + "<|im_start|>", + "<|im_end|>", + "<|action_start|>", + "<|action_end|>", + "<|interpreter|>", + "<|plugin|>", + "", + "", + "", + "", + "", + "", + "", + "", + "" + ], + "auto_map": { + "AutoTokenizer": [ + "tokenization_internlm2.InternLM2Tokenizer", + null + ] + }, + "bos_token": "", + "chat_template": "{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "model_max_length": 8192, + "pad_token": "", + "tokenizer_class": "InternLM2Tokenizer", + "unk_token": "" +} diff --git a/xtuner_config/internvl_v2_internlm2_2b_lora_finetune_food.py b/xtuner_config/internvl_v2_internlm2_2b_lora_finetune_food.py new file mode 100644 index 0000000000000000000000000000000000000000..43af705e5baa4af43ca1397b200aab53b6fef986 --- /dev/null +++ b/xtuner_config/internvl_v2_internlm2_2b_lora_finetune_food.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR, LinearLR +from peft import LoraConfig +from torch.optim import AdamW +from transformers import AutoTokenizer + +from xtuner.dataset import InternVL_V1_5_Dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.samplers import LengthGroupedSampler +from xtuner.engine.hooks import DatasetInfoHook +from xtuner.engine.runner import TrainLoop +from xtuner.model import InternVL_V1_5 +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +path = '/root/share/new_models/OpenGVLab/InternVL2-2B' + +# Data +data_root = '/root/share/datasets/FoodieQA/' # your data path +data_path = data_root + 'sivqa_llava.json' +image_folder = data_root # your image folder path +prompt_template = PROMPT_TEMPLATE.internlm2_chat +max_length = 8192 + +# Scheduler & Optimizer +batch_size = 4 # per_device +accumulative_counts = 2 +dataloader_num_workers = 4 +max_epochs = 10 +optim_type = AdamW +# official 1024 -> 4e-5 +# lr = 1e-6 +lr = 3e-5 +betas = (0.9, 0.999) +weight_decay = 0.05 +max_norm = 1 # grad clip +warmup_ratio = 0.03 + +# Save +save_steps = 64 +save_total_limit = -1 # Maximum checkpoints to keep (-1 means unlimited) + +####################################################################### +# PART 2 Model & Tokenizer & Image Processor # +####################################################################### +model = dict( + type=InternVL_V1_5, + model_path=path, + freeze_llm=True, + freeze_visual_encoder=True, + # comment the following lines if you don't want to use Lora in llm + llm_lora=dict( + type=LoraConfig, + r=128, + lora_alpha=256, + lora_dropout=0.05, + target_modules=None, + task_type='CAUSAL_LM'), + # uncomment the following lines if you don't want to use Lora in visual encoder # noqa + # visual_encoder_lora=dict( + # type=LoraConfig, r=64, lora_alpha=16, lora_dropout=0.05, + # target_modules=['attn.qkv', 'attn.proj', 'mlp.fc1', 'mlp.fc2']) +) + 
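For readers unfamiliar with xtuner's LoRA plumbing, the `llm_lora` dict above is what xtuner uses to wrap the InternLM2 language model in a PEFT adapter before training (with `freeze_llm=True` and `freeze_visual_encoder=True`, only the adapter weights are updated). A minimal sketch of the roughly equivalent direct `peft` call is shown below, using the `path` defined earlier in this config; it is illustrative only, and the explicit `target_modules` list is an assumption based on InternLM2's layer naming (this config passes `target_modules=None` and lets xtuner/PEFT pick the linear layers).

```python
# Illustrative sketch only -- xtuner performs this wrapping internally.
# The target module names below are assumptions, not taken from this config.
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModel

internvl = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16, trust_remote_code=True)
llm = internvl.language_model  # the InternLM2 LLM inside the composite InternVL2-2B checkpoint

lora_cfg = LoraConfig(
    r=128,              # rank of the low-rank update matrices
    lora_alpha=256,     # scaling factor; effective scale = lora_alpha / r = 2
    lora_dropout=0.05,
    target_modules=['wqkv', 'wo', 'w1', 'w2', 'w3'],  # assumed attention/FFN linear names
    task_type='CAUSAL_LM',
)
llm = get_peft_model(llm, lora_cfg)
llm.print_trainable_parameters()  # only the adapters train; the base LLM and ViT stay frozen
```
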
+####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +llava_dataset = dict( + type=InternVL_V1_5_Dataset, + model_path=path, + data_paths=data_path, + image_folders=image_folder, + template=prompt_template, + max_length=max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=llava_dataset, + sampler=dict( + type=LengthGroupedSampler, + length_property='modality_length', + per_device_batch_size=batch_size * accumulative_counts), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = [ + dict( + type=LinearLR, + start_factor=1e-5, + by_epoch=True, + begin=0, + end=warmup_ratio * max_epochs, + convert_to_iter_based=True), + dict( + type=CosineAnnealingLR, + eta_min=0.0, + by_epoch=True, + begin=warmup_ratio * max_epochs, + end=max_epochs, + convert_to_iter_based=True) +] + +# train, val, test setting +train_cfg = dict(type=TrainLoop, max_epochs=max_epochs) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=path, + trust_remote_code=True) + +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 10 iterations. + logger=dict(type=LoggerHook, log_metric_by_epoch=False, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per `save_steps`. + checkpoint=dict( + type=CheckpointHook, + save_optimizer=False, + by_epoch=False, + interval=save_steps, + max_keep_ckpts=save_total_limit), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) + +# set log processor +log_processor = dict(by_epoch=False)
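
As a usage note, once training finishes and the LoRA weights have been merged back into the HuggingFace-format directory shown in this diff (`work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10`), the checkpoint can be exercised through the `chat()` API defined in `modeling_internvl_chat.py` above. The sketch below is a minimal example under a few stated assumptions: a single 448x448 tile with no dynamic tiling, ImageNet mean/std normalization, a placeholder image path, and a saved config whose `auto_map` points at the modeling files included in this directory.

```python
# Minimal inference sketch under the assumptions stated above.
import torch
from PIL import Image
from torchvision import transforms
from transformers import AutoModel, AutoTokenizer

ckpt = 'work_dirs/internvl_v2_internlm2_2b_lora_finetune_food/lr35_ep10'
tokenizer = AutoTokenizer.from_pretrained(ckpt, trust_remote_code=True)
model = AutoModel.from_pretrained(ckpt, torch_dtype=torch.bfloat16, trust_remote_code=True).cuda().eval()

# Single-tile preprocessing; the real InternVL pipeline adds dynamic tiling,
# and these normalization constants are assumed (ImageNet mean/std).
preprocess = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])
image = Image.open('path/to/food_image.jpg').convert('RGB')  # placeholder path
pixel_values = preprocess(image).unsqueeze(0).to(torch.bfloat16).cuda()

question = '<image>\nWhat dish is shown in this picture?'
generation_config = dict(max_new_tokens=128, do_sample=False)
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(response)
```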