arnocandel committed
Commit 8c85f5b · Parent: c7b6f0f

Update with h2oGPT hash c146f88a7a4b65fe180dae6d92358a898b140e4a

Files changed (3):
  1. app.py +122 -44
  2. finetune.py +5 -1
  3. utils.py +62 -0
app.py CHANGED
@@ -4,8 +4,7 @@ import sys
 import os
 import traceback
 import typing
-
-from utils import set_seed, flatten_list, clear_torch_cache, system_info_print
+from utils import set_seed, flatten_list, clear_torch_cache, system_info_print, zip_data, save_generate_output
 
 SEED = 1236
 set_seed(SEED)
@@ -27,6 +26,12 @@ from finetune import get_loaders, example_data_points, generate_prompt, get_gith
     human, bot, prompt_type_to_model_name, inv_prompt_type_to_model_lower
 from stopping import CallbackToGenerator, Stream, StoppingCriteriaSub
 
+is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
+is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
+is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
+is_low_mem = is_hf  # assumes run on 24GB consumer GPU
+admin_pass = os.getenv("ADMIN_PASS")
+
 
 def main(
         load_8bit: bool = False,
@@ -53,6 +58,7 @@ def main(
 
         llama_type: bool = None,
         debug: bool = False,
+        save_dir: str = None,
         share: bool = True,
         local_files_only: bool = False,
         resume_download: bool = True,
@@ -90,15 +96,23 @@ def main(
 ):
     # allow set token directly
     use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
-    # override share if in spaces
-    if os.environ.get("HUGGINGFACE_SPACES"):
-        share = False
-        base_model = 'h2oai/h2ogpt-oasst1-512-12b'
-        load_8bit = True
-        temperature = 0.7
-        top_p = 1
-        top_k = 100
+
+    if is_public:
+        temperature = 0.4
+        top_p = 0.85
+        top_k = 70
         do_sample = True
+        if is_low_mem:
+            base_model = 'h2oai/h2ogpt-oasst1-512-12b'
+            load_8bit = True
+        else:
+            base_model = 'h2oai/h2ogpt-oasst1-512-20b'
+    if is_low_mem:
+        load_8bit = True
+    if is_hf:
+        # must override share if in spaces
+        share = False
+    save_dir = os.getenv('SAVE_DIR', save_dir)
 
     # get defaults
     model_lower = base_model.lower()
@@ -166,7 +180,7 @@ def main(
     if not eval_sharegpt_as_output:
         model, tokenizer, device = get_model(**locals())
         model_state = [model, tokenizer, device, base_model]
-        fun = partial(evaluate, model_state, debug=debug, chat=chat)
+        fun = partial(evaluate, model_state, debug=debug, chat=chat, save_dir=save_dir)
     else:
         assert eval_sharegpt_prompts_only > 0
 
@@ -202,7 +216,7 @@ def main(
             assert ex[1] in [None, '']  # should be no iinput
             assert ex[2] in [None, '']  # should be no context
             prompt = ex[0]
-            cutoff_len = 768 if os.environ.get("HUGGINGFACE_SPACES") else 2048
+            cutoff_len = 768 if is_low_mem else 2048
            inputs = stokenizer(prompt, res,
                                return_tensors="pt",
                                truncation=True,
@@ -215,8 +229,9 @@ def main(
                 score = 0.0
                 clear_torch_cache()
             except RuntimeError as e:
-                if 'Expected all tensors to be on the same device' in str(
-                        e) or 'expected scalar type Half but found Float' in str(e):
+                if 'Expected all tensors to be on the same device' in str(e) or \
+                        'expected scalar type Half but found Float' in str(e) or \
+                        'probability tensor contains either' in str(e):
                     print("GPU error: question: %s answer: %s exception: %s" % (prompt, res, str(e)),
                           flush=True)
                     traceback.print_exc()
@@ -526,11 +541,12 @@ def go_gradio(**kwargs):
         """
     else:
         description = "For more information, visit [the project's website](https://github.com/h2oai/h2ogpt).<br>"
-    if os.environ.get("HUGGINGFACE_SPACES"):
+    if is_public:
         description += """<p><b> DISCLAIMERS: </b><ul><i><li>The data used to train this model include The Pile and other sources. These may contain objectionable content, so the model may reproduce that material. Use application and responses at own risk.</i></li>"""
         if kwargs['load_8bit']:
-            description += """<i><li> Model is loaded in 8-bit and HF spaces version has other limitations in order to fit on HF GPUs, so UX can be worse than native app.</i></li>"""
-        description += """<i><li>Model loading and unloading disabled on HF SPACES to avoid GPU OOM for multi-user environment.</i></li></ul></p>"""
+            description += """<i><li> Model is loaded in 8-bit, model loading-unloading is disabled, and other limitations exist in order to fit on GPUs with lower amounts of VRAM, so UX can be worse than non-hosted version.</i></li>"""
+        description += """<i><li>Conversations may be used to improve h2oGPT. Do not share sensitive information.</i></li>"""
+        description += """<i><li>By using h2oGPT, you accept our [Terms of Service](https://github.com/h2oai/h2ogpt/blob/main/tos.md).</i></li></ul></p>"""
 
     if kwargs['verbose']:
         task_info_md = f"""
@@ -538,14 +554,43 @@ def go_gradio(**kwargs):
     else:
         task_info_md = ''
 
-    css_code = """footer {visibility: hidden}
-body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en/site/header/master/_jcr_content/root/container/header_copy/logo.coreimg.svg/1678976605175/h2o-logo.svg");}}"""
+    css_code = """footer {visibility: hidden;}
+body{background:linear-gradient(#f5f5f5,#e5e5e5);}
+body.dark{background:linear-gradient(#0d0d0d,#333333);}"""
 
-    from gradio.themes.utils import colors, fonts, sizes
+    from gradio.themes.utils import Color, colors, fonts, sizes
     if kwargs['h2ocolors']:
-        colors_dict = dict(primary_hue=colors.yellow,
-                           secondary_hue=colors.yellow,
-                           neutral_hue=colors.gray,
+        h2o_yellow = Color(
+            name="yellow",
+            c50="#fffef2",
+            c100="#fff9e6",
+            c200="#ffecb3",
+            c300="#ffe28c",
+            c400="#ffd659",
+            c500="#fec925",
+            c600="#e6ac00",
+            c700="#bf8f00",
+            c800="#a67c00",
+            c900="#664d00",
+            c950="#403000",
+        )
+        h2o_gray = Color(
+            name="gray",
+            c50="#f2f2f2",
+            c100="#e5e5e5",
+            c200="#cccccc",
+            c300="#b2b2b2",
+            c400="#999999",
+            c500="#7f7f7f",
+            c600="#666666",
+            c700="#4c4c4c",
+            c800="#333333",
+            c900="#191919",
+            c950="#0d0d0d",
+        )
+        colors_dict = dict(primary_hue=h2o_yellow,
+                           secondary_hue=h2o_yellow,
+                           neutral_hue=h2o_gray,
                            spacing_size=sizes.spacing_md,
                            radius_size=sizes.radius_md,
                            text_size=sizes.text_md,
@@ -617,12 +662,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                 {description}
                 {task_info_md}
                 """)
-        if os.environ.get("HUGGINGFACE_SPACES"):
+        if is_hf:
             gr.HTML('''<center><a href="https://huggingface.co/spaces/h2oai/h2ogpt-chatbot?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a>Duplicate this Space to skip the queue and run in a private space</center>''')
 
         # go button visible if
         base_wanted = bool(kwargs['base_model']) and kwargs['login_mode_if_model0']
-        go_btn = gr.Button(value="LOGIN", visible=base_wanted, variant="primary")
+        go_btn = gr.Button(value="ENTER", visible=base_wanted, variant="primary")
         normal_block = gr.Row(visible=not base_wanted)
         with normal_block:
             with gr.Tabs():
@@ -685,7 +730,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                                                 value=kwargs['stream_output'])
                     prompt_type = gr.Dropdown(prompt_types_strings,
                                               value=kwargs['prompt_type'], label="Prompt Type",
-                                              visible=not os.environ.get("HUGGINGFACE_SPACES"))
+                                              visible=not is_public)
                     temperature = gr.Slider(minimum=0, maximum=3,
                                             value=kwargs['temperature'],
                                             label="Temperature",
@@ -698,12 +743,12 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                                       value=kwargs['top_k'], label="Top k",
                                       info='Num. tokens to sample from'
                                       )
-                    max_beams = 8 if not os.environ.get("HUGGINGFACE_SPACES") else 2
+                    max_beams = 8 if not is_low_mem else 2
                     num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
                                           value=min(max_beams, kwargs['num_beams']), label="Beams",
                                           info="Number of searches for optimal overall probability. "
                                                "Uses more GPU memory/compute")
-                    max_max_new_tokens = 2048 if not os.environ.get("HUGGINGFACE_SPACES") else kwargs['max_new_tokens']
+                    max_max_new_tokens = 2048 if not is_low_mem else kwargs['max_new_tokens']
                     max_new_tokens = gr.Slider(
                         minimum=1, maximum=max_max_new_tokens, step=1,
                         value=min(max_max_new_tokens, kwargs['max_new_tokens']), label="Max output length",
@@ -714,7 +759,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                     )
                     early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
                                                  value=kwargs['early_stopping'])
-                    max_max_time = 60 * 5 if not os.environ.get("HUGGINGFACE_SPACES") else 60
+                    max_max_time = 60 * 5 if not is_low_mem else 60
                     max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
                                          value=min(max_max_time, kwargs['max_time']), label="Max. time",
                                          info="Max. time to search optimal output.")
@@ -724,17 +769,17 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                     num_return_sequences = gr.Slider(minimum=1, maximum=10, step=1,
                                                      value=kwargs['num_return_sequences'],
                                                      label="Number Returns", info="Must be <= num_beams",
-                                                     visible=not os.environ.get("HUGGINGFACE_SPACES"))
+                                                     visible=not is_public)
                     do_sample = gr.Checkbox(label="Sample", info="Sample, for diverse output(s)",
                                             value=kwargs['do_sample'])
                     if kwargs['chat']:
                         iinput = gr.Textbox(lines=4, label="Input",
                                             placeholder=kwargs['placeholder_input'],
-                                            visible=not os.environ.get("HUGGINGFACE_SPACES"))
+                                            visible=not is_public)
                         # nominally empty for chat mode
                         context = gr.Textbox(lines=1, label="Context",
                                              info="Ignored in chat mode.",
-                                             visible=not os.environ.get("HUGGINGFACE_SPACES"))
+                                             visible=not is_public)
 
                 with gr.TabItem("Models"):
                     with gr.Row():
@@ -744,8 +789,8 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                             model_choice = gr.Dropdown(model_options_state.value[0], label="Choose Model", value=kwargs['base_model'])
                             lora_choice = gr.Dropdown(lora_options_state.value[0], label="Choose LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
                         with gr.Column(scale=1):
-                            load_msg = "Load Model/LORA" if not os.environ.get("HUGGINGFACE_SPACES") \
-                                else "LOAD DISABLED ON HF SPACES"
+                            load_msg = "Load Model/LORA" if not is_public \
+                                else "LOAD DISABLED FOR HOSTED DEMO"
                             load_model_button = gr.Button(load_msg)
                             model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
                             lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'], visible=kwargs['show_lora'])
@@ -757,12 +802,27 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                     add_model_button = gr.Button("Add new model name")
                     add_lora_button = gr.Button("Add new LORA name", visible=kwargs['show_lora'])
                 with gr.TabItem("System"):
-                    with gr.Row():
+                    system_row = gr.Row(visible=not is_public)
+                    admin_pass_textbox = gr.Textbox(label="Admin Password", type='password', visible=is_public)
+                    admin_btn = gr.Button(value="admin", visible=is_public)
+                    with system_row:
                         with gr.Column():
                             system_text = gr.Textbox(label='System Info')
                             system_btn = gr.Button(value='Get System Info')
 
+                            zip_btn = gr.Button("Zip")
+                            file_output = gr.File()
+
+                            # Get flagged data
+                            zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
+                            zip_btn.click(zip_data1, inputs=None, outputs=file_output)
 
+        def check_admin_pass(x):
+            return gr.update(visible=x == admin_pass)
+
+        admin_btn.click(check_admin_pass, inputs=admin_pass_textbox, outputs=system_row)
+
+        # Get inputs to evaluate()
         inputs_list = get_inputs_list(locals(), kwargs['model_lower'])
         from functools import partial
         all_kwargs = kwargs.copy()
@@ -811,7 +871,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                 len(history[-1]) >= 2:
             os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 
-            max_length_tokenize = 512 if os.environ.get("HUGGINGFACE_SPACES") else 2048
+            max_length_tokenize = 512 if is_low_mem else 2048
             cutoff_len = max_length_tokenize*4  # restrict deberta related to max for LLM
 
             question = history[-1][0]
@@ -833,7 +893,9 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                 clear_torch_cache()
                 return 'Response Score: GPU OOM'
             except RuntimeError as e:
-                if 'Expected all tensors to be on the same device' in str(e) or 'expected scalar type Half but found Float' in str(e):
+                if 'Expected all tensors to be on the same device' in str(e) or \
+                        'expected scalar type Half but found Float' in str(e) or \
+                        'probability tensor contains either' in str(e):
                     print("GPU Error: question: %s answer: %s exception: %s" % (question, answer, str(e)), flush=True)
                     traceback.print_exc()
                     clear_torch_cache()
@@ -1025,7 +1087,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
                                outputs=[model_state, model_used, lora_used, prompt_type])
         prompt_update_args = dict(fn=dropdown_prompt_type_list, inputs=prompt_type, outputs=prompt_type)
         chatbot_update_args = dict(fn=chatbot_list, inputs=[text_output, model_used], outputs=text_output)
-        if not os.environ.get("HUGGINGFACE_SPACES"):
+        if not is_public:
             load_model_event = load_model_button.click(**load_model_args) \
                 .then(**prompt_update_args) \
                 .then(**chatbot_update_args) \
@@ -1079,7 +1141,7 @@ body{background-image:url("https://h2o.ai/content/experience-fragments/h2o/us/en
 
 
 input_args_list = ['model_state']
-inputs_kwargs_list = ['debug', 'chat', 'hard_stop_list', 'sanitize_bot_response', 'model_state0']
+inputs_kwargs_list = ['debug', 'chat', 'save_dir', 'hard_stop_list', 'sanitize_bot_response', 'model_state0']
 
 
 def get_inputs_list(inputs_dict, model_lower):
@@ -1142,6 +1204,7 @@ def evaluate(
         src_lang=None,
         tgt_lang=None,
         debug=False,
+        save_dir=None,
         chat=False,
         hard_stop_list=None,
         sanitize_bot_response=True,
@@ -1204,7 +1267,7 @@
             # encounters = [prompt.count(human) + 1, prompt.count(bot) + 1]
             # stopping only starts once output is beyond prompt
             # 1 human is enough to trigger, but need 2 bots, because very first view back will be bot we added
-            stop_words = [human, bot]
+            stop_words = [human, bot, '\n' + human, '\n' + bot]
             encounters = [1, 2]
         elif prompt_type == 'instruct_vicuna':
             # even below is not enough, generic strings and many ways to encode
@@ -1235,6 +1298,9 @@
         # avoid padding in front of tokens
         if tokenizer.pad_token:
             stop_words_ids = [x[1:] if x[0] == tokenizer.pad_token_id and len(x) > 1 else x for x in stop_words_ids]
+        # handle fake \n added
+        stop_words_ids = [x[1:] if y[0] == '\n' else x for x, y in zip(stop_words_ids, stop_words)]
+        # build stopper
         stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids, encounters=encounters)])
     else:
         stopping_criteria = StoppingCriteriaList()
@@ -1243,7 +1309,7 @@
     # RuntimeError: The size of tensor a (2048) must match the size of tensor b (2049) at non-singleton dimension 3
     # RuntimeError: expected scalar type Half but found Float
     # with - 256
-    max_length_tokenize = 768 - 256 if os.environ.get("HUGGINGFACE_SPACES") else 2048 - 256
+    max_length_tokenize = 768 - 256 if is_low_mem else 2048 - 256
     cutoff_len = max_length_tokenize * 4  # if reaches limit, then can't generate new tokens
     output_smallest = 30 * 4
     prompt = prompt[-cutoff_len - output_smallest:]
@@ -1332,8 +1398,9 @@
                 clear_torch_cache()
                 return
             except RuntimeError as e:
-                if 'Expected all tensors to be on the same device' in str(
-                        e) or 'expected scalar type Half but found Float' in str(e):
+                if 'Expected all tensors to be on the same device' in str(e) or \
+                        'expected scalar type Half but found Float' in str(e) or \
+                        'probability tensor contains either' in str(e):
                     print(
                         "GPU Error: prompt: %s inputs_decoded: %s exception: %s" % (prompt, inputs_decoded, str(e)),
                         flush=True)
@@ -1343,6 +1410,7 @@
             else:
                 raise
 
+        decoded_output = None
         for output in CallbackToGenerator(generate, callback=None, **gen_kwargs):
             decoded_output = decoder(output)
             if output[-1] in [tokenizer.eos_token_id]:
@@ -1353,12 +1421,16 @@
                 raise StopIteration
             yield prompter.get_response(decoded_output, prompt=inputs_decoded,
                                         sanitize_bot_response=sanitize_bot_response)
-        return
+        if save_dir and decoded_output:
+            save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
     else:
         outputs = model.generate(**gen_kwargs)
         outputs = [decoder(s) for s in outputs.sequences]
         yield prompter.get_response(outputs, prompt=inputs_decoded,
                                     sanitize_bot_response=sanitize_bot_response)
+        if save_dir and outputs and len(outputs) >= 1:
+            decoded_output = prompt + outputs[0]
+            save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
 
 
 def get_generate_params(model_lower, chat,
@@ -1569,5 +1641,11 @@ if __name__ == "__main__":
 
     python generate.py --base_model='togethercomputer/GPT-NeoXT-Chat-Base-20B' --prompt_type='human_bot' --lora_weights='GPT-NeoXT-Chat-Base-20B.merged.json.8_epochs.57b2892c53df5b8cefac45f84d019cace803ef26.28'
 
+    must have 4*48GB GPU and run without 8bit in order for sharding to work with infer_devices=False
+    can also pass --prompt_type='human_bot' and model can somewhat handle instructions without being instruct tuned
+    python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
+
+    python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-256-6.9b
+
     """, flush=True)
     fire.Fire(main)
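
The System tab changes above gate system tools behind an admin password when the app runs publicly: a gr.Row starts hidden and check_admin_pass reveals it only when the input matches ADMIN_PASS. A minimal standalone sketch of the same Gradio 3.x pattern (component names below are illustrative, not copied from the commit):

    import os
    import gradio as gr

    ADMIN_PASS = os.getenv("ADMIN_PASS")  # same env var the commit reads

    with gr.Blocks() as demo:
        # starts hidden, like system_row when is_public is True
        protected_row = gr.Row(visible=False)
        with protected_row:
            info = gr.Textbox(label="System Info")
        pass_box = gr.Textbox(label="Admin Password", type="password")
        admin_btn = gr.Button("admin")

        def check_pass(x):
            # gr.update toggles visibility without rebuilding the layout;
            # a wrong password simply keeps the row hidden
            return gr.update(visible=x == ADMIN_PASS)

        admin_btn.click(check_pass, inputs=pass_box, outputs=protected_row)

    demo.launch()

Because the check runs server-side and only toggles visibility, the password box and admin button can be shown unconditionally on public instances.
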
finetune.py CHANGED
@@ -73,6 +73,7 @@ prompt_type_to_model_name = {
         'decapoda-research/llama-7b-hf',
         'decapoda-research/llama-13b-hf',
         'decapoda-research/llama-30b-hf',
+        'decapoda-research/llama-65b-hf',
         'facebook/mbart-large-50-many-to-many-mmt',
         'philschmid/bart-large-cnn-samsum',
         'philschmid/flan-t5-base-samsum',
@@ -120,7 +121,10 @@ def train(
         save_code: bool = False,
         run_id: int = None,
 
-        base_model: str = 'EleutherAI/gpt-neox-20b',
+        base_model: str = 'h2oai/h2ogpt-oig-oasst1-256-6.9b',
+        # base_model: str = 'h2oai/h2ogpt-oasst1-512-12b',
+        # base_model: str = 'h2oai/h2ogpt-oasst1-512-20b',
+        # base_model: str = 'EleutherAI/gpt-neox-20b',
         # base_model: str = 'EleutherAI/pythia-12b-deduped',
         # base_model: str = 'togethercomputer/GPT-NeoXT-Chat-Base-20B',
         # base_model: str = 'decapoda-research/llama-7b-hf',
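
The finetune.py hunk above extends prompt_type_to_model_name, which maps each prompt type to the models that use it; app.py imports the inverse mapping (inv_prompt_type_to_model_lower) to pick a default prompt type for a given model. A hedged sketch of how such an inverse can be built (the dict contents and the exact construction here are assumptions for illustration):

    # illustrative subset; the real table in finetune.py is larger
    prompt_type_to_model_name = {
        'plain': ['decapoda-research/llama-65b-hf',
                  'facebook/mbart-large-50-many-to-many-mmt'],
        'human_bot': ['h2oai/h2ogpt-oig-oasst1-256-6.9b'],
    }

    # invert to: lower-cased model name -> prompt type
    inv_prompt_type_to_model_lower = {
        model.lower(): prompt_type
        for prompt_type, models in prompt_type_to_model_name.items()
        for model in models
    }

    assert inv_prompt_type_to_model_lower['decapoda-research/llama-65b-hf'] == 'plain'
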
utils.py CHANGED
@@ -1,7 +1,13 @@
+import contextlib
 import os
 import gc
 import random
+import shutil
 import time
+import traceback
+import zipfile
+
+import filelock
 import numpy as np
 import pandas as pd
 import torch
@@ -87,3 +93,59 @@ def system_info_print():
         return df.to_markdown()
     except Exception as e:
         return "Error: %s" % str(e)
+
+
+def zip_data(root_dirs=None, zip_path='data.zip', base_dir='./'):
+    try:
+        return _zip_data(zip_path=zip_path, base_dir=base_dir, root_dirs=root_dirs)
+    except Exception as e:
+        traceback.print_exc()
+        print('Exception in zipping: %s' % str(e))
+
+
+def _zip_data(root_dirs=None, zip_path='data.zip', base_dir='./'):
+    assert root_dirs is not None
+    with zipfile.ZipFile(zip_path, "w") as expt_zip:
+        for root_dir in root_dirs:
+            if root_dir is None:
+                continue
+            for root, d, files in os.walk(root_dir):
+                for file in files:
+                    file_to_archive = os.path.join(root, file)
+                    assert os.path.exists(file_to_archive)
+                    path_to_archive = os.path.relpath(file_to_archive, base_dir)
+                    expt_zip.write(filename=file_to_archive, arcname=path_to_archive)
+    return "data.zip"
+
+
+def save_generate_output(output=None, base_model=None, save_dir=None):
+    try:
+        return _save_generate_output(output=output, base_model=base_model, save_dir=save_dir)
+    except Exception as e:
+        traceback.print_exc()
+        print('Exception in saving: %s' % str(e))
+
+
+def _save_generate_output(output=None, base_model=None, save_dir=None):
+    """
+    Save conversation to .json, row by row.
+    json_file_path is path to final JSON file. If not in ., then will attempt to make directories.
+    Appends if file exists.
+    """
+    assert save_dir, "save_dir must be provided"
+    if os.path.exists(save_dir) and not os.path.isdir(save_dir):
+        raise RuntimeError("save_dir already exists and is not a directory!")
+    os.makedirs(save_dir, exist_ok=True)
+    import json
+    if output[-10:] == '\n\n<human>:':
+        # remove trailing <human>:
+        output = output[:-10]
+    with filelock.FileLock("save_dir.lock"):
+        # lock logging in case have concurrency
+        with open(os.path.join(save_dir, "history.json"), "a") as f:
+            # just add [ at start, and ] at end, and have proper JSON dataset
+            f.write(
+                " " + json.dumps(
+                    dict(text=output, time=time.ctime(), base_model=base_model)
+                ) + ",\n"
+            )
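
As its in-code comment notes, _save_generate_output appends one JSON object per generation, each followed by a comma, so the log becomes a valid JSON array once wrapped in brackets. A small sketch of reading it back under that format (load_history is a hypothetical helper, not part of the commit):

    import json
    import os

    def load_history(save_dir):  # hypothetical reader, not in utils.py
        # each row is ' {...},\n'; drop the final comma and wrap in [ ]
        with open(os.path.join(save_dir, "history.json")) as f:
            body = f.read().rstrip().rstrip(",")
        return json.loads("[" + body + "]")

    for row in load_history("./save_dir"):
        print(row["time"], row["base_model"], row["text"][:80])

The keys (text, time, base_model) match the dict written by _save_generate_output above.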