h2ogpt-chatbot2

Runtime error

App Files Files Community

pseudotensor committed on May 3, 2023

Commit

9aa08b9

1 Parent(s): 9e9d047

Update with h2oGPT hash d5a4556404029122394e3b1c0a4ea97d8c996bb6

Browse files

Files changed (3) hide show

generate.py +187 -73
gradio_runner.py +107 -84
utils.py +34 -0

generate.py CHANGED Viewed

@@ -1,14 +1,15 @@
 import functools
 import sys
 import os
 import traceback
 import typing
-from threading import Thread
 from datetime import datetime
 import filelock
 import psutil
-from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial
 SEED = 1236
 set_seed(SEED)
@@ -35,11 +36,11 @@ eval_extra_columns = ['prompt', 'response', 'score']
 def main(
         load_8bit: bool = False,
         load_half: bool = True,
-        infer_devices: bool = True,  # really if to "control" devices now
         base_model: str = '',
         tokenizer_base_model: str = '',
         lora_weights: str = "",
-        gpu_id: int = 0,  # if infer_devices = True and gpu_id != -1
         prompt_type: Union[int, str] = None,
         # input to generation
@@ -60,7 +61,7 @@ def main(
         share: bool = True,
         local_files_only: bool = False,
         resume_download: bool = True,
-        use_auth_token: Union[str, bool] = False,  # True requires CLI did huggingface-cli login before running
         src_lang: str = "English",
         tgt_lang: str = "Russian",
@@ -68,20 +69,18 @@ def main(
         gradio: bool = True,
         gradio_avoid_processing_markdown: bool = False,
         chat: bool = True,
-        chat_history: int = 4096,  # character length of chat context/history
-        chat_context: bool = False,  # use default context if human_bot
         stream_output: bool = True,
         show_examples: bool = None,
         verbose: bool = False,
         h2ocolors: bool = True,
         height: int = 400,
         show_lora: bool = True,
-        # set to True to load --base_model after client logs in,
-        # to be able to free GPU memory when model is swapped
         login_mode_if_model0: bool = False,
         block_gradio_exit: bool = True,
         concurrency_count: int = 1,
-        api_open: bool = False,  # don't let API skip queue
         allow_api: bool = True,
         input_lines: int = 1,
@@ -97,9 +96,64 @@ def main(
         eval_sharegpt_prompts_only: int = 0,
         eval_sharegpt_prompts_only_seed: int = 1234,
         eval_sharegpt_as_output: bool = False,
-        hard_stop_list: typing.List[str] = [],
 ):
     is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
     is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
     is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
@@ -107,7 +161,7 @@ def main(
     admin_pass = os.getenv("ADMIN_PASS")
     # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
     # but becomes unrecoverable sometimes if raise, so just be silent for now
-    raise_generate_gpu_exceptions = not is_public
     # allow set token directly
     use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
@@ -223,9 +277,10 @@ def main(
         eval_filename = os.path.join(scoring_path, eval_filename)
         # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
-        context_class = NullContext() if n_gpus > 1 or n_gpus == 0 else torch.device("cuda")
-        with context_class:
             # ensure was set right above before examples generated
             assert not stream_output, "stream_output=True does not make sense with example loop"
             import time
@@ -240,7 +295,8 @@ def main(
                 fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir, is_low_mem=is_low_mem,
                               raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
                               chat_context=chat_context,
-                              concurrency_count=concurrency_count)
             else:
                 assert eval_sharegpt_prompts_only > 0
@@ -288,7 +344,7 @@ def main(
                                             truncation=True,
                                             max_length=cutoff_len)
                         try:
-                            score = torch.sigmoid(smodel(**inputs).logits[0]).cpu().detach().numpy()[0]
                         except torch.cuda.OutOfMemoryError as e:
                             print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)), flush=True)
                             traceback.print_exc()
@@ -649,12 +705,12 @@ def evaluate(
         debug=False,
         concurrency_count=None,
         save_dir=None,
-        hard_stop_list=None,
         sanitize_bot_response=True,
         model_state0=None,
         is_low_mem=None,
         raise_generate_gpu_exceptions=None,
         chat_context=None,
 ):
     # ensure passed these
     assert concurrency_count is not None
@@ -710,10 +766,6 @@ def evaluate(
     prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
     prompt = prompter.generate_prompt(data_point)
-    if hard_stop_list is None:
-        # acts like undo on user entry and bot response
-        hard_stop_list = []
     if isinstance(tokenizer, str):
         # pipeline
         if tokenizer == "summarization":
@@ -829,55 +881,115 @@ def evaluate(
                                     )
     with torch.no_grad():
-        # protection for gradio not keeping track of closed users,
-        # else hit bitsandbytes lack of thread safety:
-        # https://github.com/h2oai/h2ogpt/issues/104
-        # but only makes sense if concurrency_count == 1
-        context_class = NullContext #if concurrency_count > 1 else filelock.FileLock
-        print('Pre-Generate: %s' % str(datetime.now()), flush=True)
-        decoded_output = None
-        with context_class("generate.lock"):
-            print('Generate: %s' % str(datetime.now()), flush=True)
-            # decoded tokenized prompt can deviate from prompt due to special characters
-            inputs_decoded = decoder(input_ids[0])
-            inputs_decoded_raw = decoder_raw(input_ids[0])
-            if inputs_decoded == prompt:
-                # normal
-                pass
-            elif inputs_decoded.lstrip() == prompt.lstrip():
-                # sometimes extra space in front, make prompt same for prompt removal
-                prompt = inputs_decoded
-            elif inputs_decoded_raw == prompt:
-                # some models specify special tokens that are part of normal prompt, so can't skip them
-                inputs_decoded_raw = inputs_decoded
-                decoder = decoder_raw
-            else:
-                print("WARNING: Special characters in prompt", flush=True)
-            if stream_output:
-                skip_prompt = False
-                streamer = TextIteratorStreamer(tokenizer, skip_prompt=skip_prompt)
-                gen_kwargs.update(dict(streamer=streamer))
-                target_func = generate_with_exceptions
-                target = wrapped_partial(generate_with_exceptions, model.generate, prompt, inputs_decoded,
-                                         raise_generate_gpu_exceptions, **gen_kwargs)
-                thread = Thread(target=target)
-                thread.start()
-                outputs = ""
-                for new_text in streamer:
-                    outputs += new_text
                     yield prompter.get_response(outputs, prompt=inputs_decoded,
                                                 sanitize_bot_response=sanitize_bot_response)
-                decoded_output = outputs
-            else:
-                outputs = model.generate(**gen_kwargs)
-                outputs = [decoder(s) for s in outputs.sequences]
-                yield prompter.get_response(outputs, prompt=inputs_decoded,
-                                            sanitize_bot_response=sanitize_bot_response)
-                if outputs and len(outputs) >= 1:
-                    decoded_output = prompt + outputs[0]
-            if save_dir and decoded_output:
-                save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
-        print('Post-Generate: %s decoded_output: %s' % (str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True)
 def generate_with_exceptions(func, prompt, inputs_decoded, raise_generate_gpu_exceptions, **kwargs):
@@ -908,7 +1020,8 @@ def generate_with_exceptions(func, prompt, inputs_decoded, raise_generate_gpu_ex
             return
         else:
             clear_torch_cache()
-            raise
 def get_generate_params(model_lower, chat,
@@ -1154,7 +1267,9 @@ def score_qa(smodel, stokenizer, max_length_tokenize, question, answer, cutoff_l
 if __name__ == "__main__":
-    print("""
     WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
     python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
     python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
@@ -1180,6 +1295,5 @@ if __name__ == "__main__":
     python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
     python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
-    """, flush=True)
     fire.Fire(main)

 import functools
+import queue
 import sys
 import os
+import time
 import traceback
 import typing
 from datetime import datetime
 import filelock
 import psutil
+from utils import set_seed, clear_torch_cache, save_generate_output, NullContext, wrapped_partial, EThread
 SEED = 1236
 set_seed(SEED)
 def main(
         load_8bit: bool = False,
         load_half: bool = True,
+        infer_devices: bool = True,
         base_model: str = '',
         tokenizer_base_model: str = '',
         lora_weights: str = "",
+        gpu_id: int = 0,
         prompt_type: Union[int, str] = None,
         # input to generation
         share: bool = True,
         local_files_only: bool = False,
         resume_download: bool = True,
+        use_auth_token: Union[str, bool] = False,
         src_lang: str = "English",
         tgt_lang: str = "Russian",
         gradio: bool = True,
         gradio_avoid_processing_markdown: bool = False,
         chat: bool = True,
+        chat_history: int = 4096,
+        chat_context: bool = False,
         stream_output: bool = True,
         show_examples: bool = None,
         verbose: bool = False,
         h2ocolors: bool = True,
         height: int = 400,
         show_lora: bool = True,
         login_mode_if_model0: bool = False,
         block_gradio_exit: bool = True,
         concurrency_count: int = 1,
+        api_open: bool = False,
         allow_api: bool = True,
         input_lines: int = 1,
         eval_sharegpt_prompts_only: int = 0,
         eval_sharegpt_prompts_only_seed: int = 1234,
         eval_sharegpt_as_output: bool = False,
 ):
+    """
+    :param load_8bit: load model in 8-bit using bitsandbytes
+    :param load_half: load model in float16
+    :param infer_devices: whether to control devices with gpu_id.  If False, then spread across GPUs
+    :param base_model: model HF-type name
+    :param tokenizer_base_model: tokenizer HF-type name
+    :param lora_weights: LORA weights path/HF link
+    :param gpu_id: if infer_devices, then use gpu_id for cuda device ID, or auto mode if gpu_id != -1
+    :param prompt_type: type of prompt, usually matched to fine-tuned model or plain for foundational model
+    :param temperature: generation temperature
+    :param top_p: generation top_p
+    :param top_k: generation top_k
+    :param num_beams: generation number of beams
+    :param repetition_penalty: generation repetition penalty
+    :param num_return_sequences: generation number of sequences (1 forced for chat)
+    :param do_sample: generation sample
+    :param max_new_tokens: generation max new tokens
+    :param min_new_tokens: generation min tokens
+    :param early_stopping: generation early stopping
+    :param max_time: maximum time to allow for generation
+    :param debug: enable debug mode
+    :param save_dir: directory chat data is saved to
+    :param share: whether to share the gradio app with sharable URL
+    :param local_files_only: whether to only use local files instead of going to HF for models
+    :param resume_download: whether to resume downloads from HF for models
+    :param use_auth_token: whether to use HF auth token (requires CLI did huggingface-cli login before)
+    :param src_lang: source languages to include if doing translation (None = all)
+    :param tgt_lang: target languages to include if doing translation (None = all)
+    :param gradio: whether to enable gradio, or to enable benchmark mode
+    :param gradio_avoid_processing_markdown:
+    :param chat: whether to enable chat mode with chat history
+    :param chat_history: maximum character length of chat context/history
+    :param chat_context: whether to use extra helpful context if human_bot
+    :param stream_output: whether to stream output from generate
+    :param show_examples: whether to show clickable examples in gradio
+    :param verbose: whether to show verbose prints
+    :param h2ocolors: whether to use H2O.ai theme
+    :param height: height of chat window
+    :param show_lora: whether to show LORA options in UI (expert so can be hard to understand)
+    :param login_mode_if_model0: set to True to load --base_model after client logs in, to be able to free GPU memory when model is swapped
+    :param block_gradio_exit: whether to block gradio exit (used for testing)
+    :param concurrency_count: gradio concurrency count (1 is optimal for LLMs)
+    :param api_open: If False, don't let API calls skip gradio queue
+    :param allow_api: whether to allow API calls at all to gradio server
+    :param input_lines: how many input lines to show for chat box (>1 forces shift-enter for submit, else enter is submit)
+    :param sanitize_user_prompt: whether to remove profanity from user input
+    :param sanitize_bot_response: whether to remove profanity and repeat lines from bot output
+    :param extra_model_options: extra models to show in list in gradio
+    :param extra_lora_options: extra LORA to show in list in gradio
+    :param score_model: which model to score responses (None means no scoring)
+    :param auto_score: whether to automatically score responses
+    :param eval_sharegpt_prompts_only: for no gradio benchmark, if using ShareGPT prompts for eval
+    :param eval_sharegpt_prompts_only_seed: for no gradio benchmark, if seed for ShareGPT sampling
+    :param eval_sharegpt_as_output: for no gradio benchmark, whether to test ShareGPT output itself
+    :return:
+    """
     is_hf = bool(os.getenv("HUGGINGFACE_SPACES"))
     is_gpth2oai = bool(os.getenv("GPT_H2O_AI"))
     is_public = is_hf or is_gpth2oai  # multi-user case with fixed model and disclaimer
     admin_pass = os.getenv("ADMIN_PASS")
     # will sometimes appear in UI or sometimes actual generation, but maybe better than empty result
     # but becomes unrecoverable sometimes if raise, so just be silent for now
+    raise_generate_gpu_exceptions = True
     # allow set token directly
     use_auth_token = os.environ.get("HUGGINGFACE_API_TOKEN", use_auth_token)
         eval_filename = os.path.join(scoring_path, eval_filename)
         # torch.device("cuda") leads to cuda:x cuda:y mismatches for multi-GPU consistently
+        device = 'cpu' if n_gpus == 0 else 'cuda'
+        context_class = NullContext if n_gpus > 1 or n_gpus == 0 else torch.device
+        with context_class(device):
             # ensure was set right above before examples generated
             assert not stream_output, "stream_output=True does not make sense with example loop"
             import time
                 fun = partial(evaluate, model_state, debug=debug, save_dir=save_dir, is_low_mem=is_low_mem,
                               raise_generate_gpu_exceptions=raise_generate_gpu_exceptions,
                               chat_context=chat_context,
+                              concurrency_count=concurrency_count,
+                              lora_weights=lora_weights)
             else:
                 assert eval_sharegpt_prompts_only > 0
                                             truncation=True,
                                             max_length=cutoff_len)
                         try:
+                            score = torch.sigmoid(smodel(**inputs).logits[0].float()).cpu().detach().numpy()[0]
                         except torch.cuda.OutOfMemoryError as e:
                             print("GPU OOM 1: question: %s answer: %s exception: %s" % (prompt, res, str(e)), flush=True)
                             traceback.print_exc()
         debug=False,
         concurrency_count=None,
         save_dir=None,
         sanitize_bot_response=True,
         model_state0=None,
         is_low_mem=None,
         raise_generate_gpu_exceptions=None,
         chat_context=None,
+        lora_weights=None,
 ):
     # ensure passed these
     assert concurrency_count is not None
     prompter = Prompter(prompt_type, debug=debug, chat=chat, stream_output=stream_output)
     prompt = prompter.generate_prompt(data_point)
     if isinstance(tokenizer, str):
         # pipeline
         if tokenizer == "summarization":
                                     )
     with torch.no_grad():
+        context_class_cast = NullContext if device == 'cpu' or lora_weights else torch.autocast
+        with context_class_cast(device):
+            # protection for gradio not keeping track of closed users,
+            # else hit bitsandbytes lack of thread safety:
+            # https://github.com/h2oai/h2ogpt/issues/104
+            # but only makes sense if concurrency_count == 1
+            context_class = NullContext #if concurrency_count > 1 else filelock.FileLock
+            print('Pre-Generate: %s' % str(datetime.now()), flush=True)
+            decoded_output = None
+            with context_class("generate.lock"):
+                print('Generate: %s' % str(datetime.now()), flush=True)
+                # decoded tokenized prompt can deviate from prompt due to special characters
+                inputs_decoded = decoder(input_ids[0])
+                inputs_decoded_raw = decoder_raw(input_ids[0])
+                if inputs_decoded == prompt:
+                    # normal
+                    pass
+                elif inputs_decoded.lstrip() == prompt.lstrip():
+                    # sometimes extra space in front, make prompt same for prompt removal
+                    prompt = inputs_decoded
+                elif inputs_decoded_raw == prompt:
+                    # some models specify special tokens that are part of normal prompt, so can't skip them
+                    inputs_decoded_raw = inputs_decoded
+                    decoder = decoder_raw
+                else:
+                    print("WARNING: Special characters in prompt", flush=True)
+                if stream_output:
+                    skip_prompt = False
+                    streamer = H2OTextIteratorStreamer(tokenizer, skip_prompt=skip_prompt, block=False)
+                    gen_kwargs.update(dict(streamer=streamer))
+                    target_func = generate_with_exceptions
+                    target = wrapped_partial(generate_with_exceptions, model.generate, prompt, inputs_decoded,
+                                             raise_generate_gpu_exceptions, **gen_kwargs)
+                    bucket = queue.Queue()
+                    thread = EThread(target=target, kwargs=dict(streamer=streamer), bucket=bucket)
+                    thread.start()
+                    outputs = ""
+                    try:
+                        for new_text in streamer:
+                            if bucket.qsize() > 0 or thread.exc:
+                                thread.join()
+                            outputs += new_text
+                            yield prompter.get_response(outputs, prompt=inputs_decoded,
+                                                        sanitize_bot_response=sanitize_bot_response)
+                    except BaseException:
+                        # if any exception, raise that exception if was from thread, first
+                        if thread.exc:
+                            raise thread.exc
+                        raise
+                    finally:
+                        # in case no exception and didn't join with thread yet, then join
+                        if not thread.exc:
+                            thread.join()
+                    # in case raise StopIteration or broke queue loop in streamer, but still have exception
+                    if thread.exc:
+                        raise thread.exc
+                    decoded_output = outputs
+                else:
+                    outputs = model.generate(**gen_kwargs)
+                    outputs = [decoder(s) for s in outputs.sequences]
                     yield prompter.get_response(outputs, prompt=inputs_decoded,
                                                 sanitize_bot_response=sanitize_bot_response)
+                    if outputs and len(outputs) >= 1:
+                        decoded_output = prompt + outputs[0]
+                if save_dir and decoded_output:
+                    save_generate_output(output=decoded_output, base_model=base_model, save_dir=save_dir)
+            print('Post-Generate: %s decoded_output: %s' % (str(datetime.now()), len(decoded_output) if decoded_output else -1), flush=True)
+class H2OTextIteratorStreamer(TextIteratorStreamer):
+    """
+    Normally a timeout is required for now to handle exceptions, since otherwise get() would keep waiting.
+    With this H2O version of TextIteratorStreamer, the queue read is instead retried in a loop (see block) to handle them.
+    """
+    def __init__(self, tokenizer, skip_prompt: bool = False, timeout: typing.Optional[float] = None,
+                 block=True, **decode_kwargs):
+        super().__init__(tokenizer, skip_prompt, **decode_kwargs)
+        self.text_queue = queue.Queue()
+        self.stop_signal = None
+        self.do_stop = False
+        self.timeout = timeout
+        self.block = block
+    def on_finalized_text(self, text: str, stream_end: bool = False):
+        """Put the new text in the queue. If the stream is ending, also put a stop signal in the queue."""
+        self.text_queue.put(text, timeout=self.timeout)
+        if stream_end:
+            self.text_queue.put(self.stop_signal, timeout=self.timeout)
+    def __iter__(self):
+        return self
+    def __next__(self):
+        while True:
+            try:
+                value = self.stop_signal  # value looks unused in pycharm, not true
+                if self.do_stop:
+                    print("hit stop", flush=True)
+                    # could raise or break, maybe best to raise and make parent see if any exception in thread
+                    raise StopIteration()
+                    #break
+                value = self.text_queue.get(block=self.block, timeout=self.timeout)
+                break
+            except queue.Empty:
+                time.sleep(0.01)
+        if value == self.stop_signal:
+            raise StopIteration()
+        else:
+            return value
 def generate_with_exceptions(func, prompt, inputs_decoded, raise_generate_gpu_exceptions, **kwargs):
             return
         else:
             clear_torch_cache()
+            if raise_generate_gpu_exceptions:
+                raise
 def get_generate_params(model_lower, chat,
 if __name__ == "__main__":
+    """
+    Examples:
     WORLD_SIZE=4 CUDA_VISIBLE_DEVICES="0,1,2,3" torchrun --nproc_per_node=4 --master_port=1234 generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights=lora-alpaca_6B
     python generate.py --base_model='EleutherAI/gpt-j-6B' --lora_weights='lora-alpaca_6B'
     python generate.py --base_model='EleutherAI/gpt-neox-20b' --lora_weights='lora-alpaca_20B'
     python generate.py --base_model=decapoda-research/llama-65b-hf --load_8bit=False --infer_devices=False --prompt_type='human_bot'
     python generate.py --base_model=h2oai/h2ogpt-oig-oasst1-512-6.9b
+    """
     fire.Fire(main)

gradio_runner.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import functools
 import inspect
 import os
@@ -246,7 +247,11 @@ def go_gradio(**kwargs):
                                 value=kwargs['top_k'], label="Top k",
                                 info='Num. tokens to sample from'
                             )
-                            max_beams = 8 if not is_low_mem else 1
                             num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
                                                   value=min(max_beams, kwargs['num_beams']), label="Beams",
                                                   info="Number of searches for optimal overall probability.  "
@@ -262,7 +267,9 @@ def go_gradio(**kwargs):
                             )
                             early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
                                                          value=kwargs['early_stopping'])
-                            max_max_time = 60 * 5 if not is_low_mem else 60
                             max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
                                                  value=min(max_max_time, kwargs['max_time']), label="Max. time",
                                                  info="Max. time to search optimal output.")
@@ -309,9 +316,10 @@ def go_gradio(**kwargs):
                                     model_gpu = gr.Dropdown(n_gpus_list,
                                                             label="GPU ID 2 [-1 = all GPUs, if Choose is enabled]",
                                                             value=kwargs['gpu_id'])
-                                    model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'])
                                     lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'],
-                                                           visible=kwargs['show_lora'])
                             with gr.Row():
                                 with gr.Column(scale=50):
                                     new_model = gr.Textbox(label="New Model HF name/path")
@@ -354,15 +362,15 @@ def go_gradio(**kwargs):
                         with gr.Column():
                             with gr.Row():
                                 system_btn = gr.Button(value='Get System Info')
-                                system_text = gr.Textbox(label='System Info')
                             with gr.Row():
                                 zip_btn = gr.Button("Zip")
-                                zip_text = gr.Textbox(label="Zip file name")
                                 file_output = gr.File()
                             with gr.Row():
                                 s3up_btn = gr.Button("S3UP")
-                                s3up_text = gr.Textbox(label='S3UP result')
         # Get flagged data
         zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
@@ -395,12 +403,15 @@ def go_gradio(**kwargs):
         dark_mode_btn = gr.Button("Dark Mode", variant="primary").style(
             size="sm",
         )
         dark_mode_btn.click(
             None,
             None,
             None,
             _js=get_dark_js(),
             api_name="dark" if allow_api else None,
         )
         # Control chat and non-chat blocks, which can be independently used by chat checkbox swap
@@ -415,7 +426,8 @@ def go_gradio(**kwargs):
         chat.select(col_nochat_fun, chat, col_nochat, api_name="chat_checkbox" if allow_api else None) \
             .then(col_chat_fun, chat, col_chat) \
-            .then(context_fun, chat, context)
         # examples after submit or any other buttons for chat or no chat
         if kwargs['examples'] is not None and kwargs['show_examples']:
@@ -514,6 +526,10 @@ def go_gradio(**kwargs):
             if sanitize_user_prompt:
                 from better_profanity import profanity
                 user_message1 = profanity.censor(user_message1)
             history = args_list[-1]
             if undo and history:
@@ -541,15 +557,17 @@ def go_gradio(**kwargs):
             :param retry:
             :return:
             """
-            args_list = list(args).copy()
             history = args_list[-1]  # model_state is -2
             if retry and history:
                 history.pop()
             if not history:
                 print("No history", flush=True)
                 return
             # ensure output will be unique to models
-            history = history.copy()
             instruction1 = history[-1][0]
             context1 = ''
             if kwargs['chat_history'] > 0:
@@ -571,6 +589,8 @@ def go_gradio(**kwargs):
             args_list[2] = context1[-kwargs['chat_history']:]
             model_state1 = args_list[-2]
             if model_state1[0] is None or model_state1[0] == no_model_str:
                 return
             args_list = args_list[:-2]
             fun1 = partial(evaluate,
@@ -580,19 +600,25 @@ def go_gradio(**kwargs):
                 for output in fun1(*tuple(args_list)):
                     bot_message = output
                     history[-1][1] = bot_message
-                    yield history
             except StopIteration:
-                yield history
             except RuntimeError as e:
                 if "generator raised StopIteration" in str(e):
                     # assume last entry was bad, undo
                     history.pop()
-                    yield history
-                raise
             except Exception as e:
                 # put error into user input
-                history[-1][0] = "Exception: %s" % str(e)
-                yield history
                 raise
             return
@@ -603,11 +629,11 @@ def go_gradio(**kwargs):
                          )
         bot_args = dict(fn=bot,
                         inputs=inputs_list + [model_state] + [text_output],
-                        outputs=text_output,
                         )
         retry_bot_args = dict(fn=functools.partial(bot, retry=True),
                               inputs=inputs_list + [model_state] + [text_output],
-                              outputs=text_output,
                               )
         undo_user_args = dict(fn=functools.partial(user, undo=True),
                               inputs=inputs_list + [text_output],
@@ -621,11 +647,11 @@ def go_gradio(**kwargs):
                           )
         bot_args2 = dict(fn=bot,
                          inputs=inputs_list + [model_state2] + [text_output2],
-                         outputs=text_output2,
                          )
         retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
                                inputs=inputs_list + [model_state2] + [text_output2],
-                               outputs=text_output2,
                                )
         undo_user_args2 = dict(fn=functools.partial(user, undo=True),
                                inputs=inputs_list + [text_output2],
@@ -636,67 +662,61 @@ def go_gradio(**kwargs):
             return gr.Textbox.update(value='')
         if kwargs['auto_score']:
-            # in case 2nd model, consume instruction first, so can clear quickly
-            # bot doesn't consume instruction itself, just history from user, so why works
-            submit_event = instruction.submit(**user_args, queue=queue,
-                                              api_name='instruction' if allow_api else None) \
-                .then(**user_args2, api_name='instruction2' if allow_api else None) \
-                .then(clear_instruct, None, instruction) \
-                .then(clear_instruct, None, iinput) \
-                .then(**bot_args, api_name='instruction_bot' if allow_api else None, queue=queue) \
-                .then(**score_args, api_name='instruction_bot_score' if allow_api else None, queue=queue) \
-                .then(**bot_args2, api_name='instruction_bot2' if allow_api else None, queue=queue) \
-                .then(**score_args2, api_name='instruction_bot_score2' if allow_api else None, queue=queue) \
-                .then(clear_torch_cache)
-            submit_event2 = submit.click(**user_args, api_name='submit' if allow_api else None) \
-                .then(**user_args2, api_name='submit2' if allow_api else None) \
-                .then(clear_instruct, None, instruction) \
-                .then(clear_instruct, None, iinput) \
-                .then(**bot_args, api_name='submit_bot' if allow_api else None, queue=queue) \
-                .then(**score_args, api_name='submit_bot_score' if allow_api else None, queue=queue) \
-                .then(**bot_args2, api_name='submit_bot2' if allow_api else None, queue=queue) \
-                .then(**score_args2, api_name='submit_bot_score2' if allow_api else None, queue=queue) \
-                .then(clear_torch_cache)
-            submit_event3 = retry.click(**user_args, api_name='retry' if allow_api else None) \
-                .then(**user_args2, api_name='retry2' if allow_api else None) \
-                .then(clear_instruct, None, instruction) \
-                .then(clear_instruct, None, iinput) \
-                .then(**retry_bot_args, api_name='retry_bot' if allow_api else None, queue=queue) \
-                .then(**score_args, api_name='retry_bot_score' if allow_api else None, queue=queue) \
-                .then(**retry_bot_args2, api_name='retry_bot2' if allow_api else None, queue=queue) \
-                .then(**score_args2, api_name='retry_bot_score2' if allow_api else None, queue=queue) \
-                .then(clear_torch_cache)
-            submit_event4 = undo.click(**undo_user_args, api_name='undo' if allow_api else None) \
-                .then(**undo_user_args2, api_name='undo2' if allow_api else None) \
-                .then(clear_instruct, None, instruction) \
-                .then(clear_instruct, None, iinput) \
-                .then(**score_args, api_name='undo_score' if allow_api else None) \
-                .then(**score_args2, api_name='undo_score2' if allow_api else None)
         else:
-            submit_event = instruction.submit(**user_args,
-                                              api_name='instruction' if allow_api else None) \
-                .then(**user_args2, api_name='instruction2' if allow_api else None) \
-                .then(clear_instruct, None, instruction) \
-                .then(clear_instruct, None, iinput) \
-                .then(**bot_args, api_name='instruction_bot' if allow_api else None, queue=queue) \
-                .then(**bot_args2, api_name='instruction_bot2' if allow_api else None, queue=queue) \
-                .then(clear_torch_cache)
-            submit_event2 = submit.click(**user_args, api_name='submit' if allow_api else None) \
-                .then(**user_args2, api_name='submit2' if allow_api else None) \
-                .then(clear_instruct, None, instruction) \
-                .then(clear_instruct, None, iinput) \
-                .then(**bot_args, api_name='submit_bot' if allow_api else None, queue=queue) \
-                .then(**bot_args2, api_name='submit_bot2' if allow_api else None, queue=queue) \
-                .then(clear_torch_cache)
-            submit_event3 = retry.click(**user_args, api_name='retry' if allow_api else None) \
-                .then(**user_args2, api_name='retry2' if allow_api else None) \
-                .then(clear_instruct, None, instruction) \
-                .then(clear_instruct, None, iinput) \
-                .then(**retry_bot_args, api_name='retry_bot' if allow_api else None, queue=queue) \
-                .then(**retry_bot_args2, api_name='retry_bot2' if allow_api else None, queue=queue) \
-                .then(clear_torch_cache)
-            submit_event4 = undo.click(**undo_user_args, api_name='undo' if allow_api else None) \
-                .then(**undo_user_args2, api_name='undo2' if allow_api else None)
         # does both models
         clear.click(lambda: None, None, text_output, queue=False, api_name='clear' if allow_api else None) \
@@ -864,9 +884,12 @@ def go_gradio(**kwargs):
                                         api_name='system_info' if allow_api else None, queue=False)
         # don't pass text_output, don't want to clear output, just stop it
-        # FIXME: have to click once to stop output and second time to stop GPUs going
         stop_btn.click(lambda: None, None, None,
-                       cancels=[submit_event_nochat, submit_event, submit_event2, submit_event3],
                        queue=False, api_name='stop' if allow_api else None).then(clear_torch_cache, queue=False)
         demo.load(None, None, None, _js=get_dark_js() if kwargs['h2ocolors'] else None)
@@ -887,8 +910,8 @@ def go_gradio(**kwargs):
 input_args_list = ['model_state']
-inputs_kwargs_list = ['debug', 'save_dir', 'hard_stop_list', 'sanitize_bot_response', 'model_state0', 'is_low_mem',
-                      'raise_generate_gpu_exceptions', 'chat_context', 'concurrency_count']
 def get_inputs_list(inputs_dict, model_lower):

+import copy
 import functools
 import inspect
 import os
                                 value=kwargs['top_k'], label="Top k",
                                 info='Num. tokens to sample from'
                             )
+                            # FIXME: https://github.com/h2oai/h2ogpt/issues/106
+                            if os.getenv('TESTINGFAIL'):
+                                 max_beams = 8 if not (is_low_mem or is_public) else 1
+                            else:
+                                max_beams = 1
                             num_beams = gr.Slider(minimum=1, maximum=max_beams, step=1,
                                                   value=min(max_beams, kwargs['num_beams']), label="Beams",
                                                   info="Number of searches for optimal overall probability.  "
                             )
                             early_stopping = gr.Checkbox(label="EarlyStopping", info="Stop early in beam search",
                                                          value=kwargs['early_stopping'])
+                            max_max_time = 60 * 5 if not is_public else 60 * 2
+                            if is_hf:
+                                max_max_time = min(max_max_time, 60 * 1)
                             max_time = gr.Slider(minimum=0, maximum=max_max_time, step=1,
                                                  value=min(max_max_time, kwargs['max_time']), label="Max. time",
                                                  info="Max. time to search optimal output.")
                                     model_gpu = gr.Dropdown(n_gpus_list,
                                                             label="GPU ID 2 [-1 = all GPUs, if Choose is enabled]",
                                                             value=kwargs['gpu_id'])
+                                    model_used = gr.Textbox(label="Current Model", value=kwargs['base_model'],
+                                                            interactive=False)
                                     lora_used = gr.Textbox(label="Current LORA", value=kwargs['lora_weights'],
+                                                           visible=kwargs['show_lora'], interactive=False)
                             with gr.Row():
                                 with gr.Column(scale=50):
                                     new_model = gr.Textbox(label="New Model HF name/path")
                         with gr.Column():
                             with gr.Row():
                                 system_btn = gr.Button(value='Get System Info')
+                                system_text = gr.Textbox(label='System Info', interactive=False)
                             with gr.Row():
                                 zip_btn = gr.Button("Zip")
+                                zip_text = gr.Textbox(label="Zip file name", interactive=False)
                                 file_output = gr.File()
                             with gr.Row():
                                 s3up_btn = gr.Button("S3UP")
+                                s3up_text = gr.Textbox(label='S3UP result', interactive=False)
         # Get flagged data
         zip_data1 = functools.partial(zip_data, root_dirs=['flagged_data_points', kwargs['save_dir']])
         dark_mode_btn = gr.Button("Dark Mode", variant="primary").style(
             size="sm",
         )
+        # FIXME: Could add exceptions for non-chat but still streaming
+        exception_text = gr.Textbox(value="", visible=kwargs['chat'], label='Chat Exceptions', interactive=False)
         dark_mode_btn.click(
             None,
             None,
             None,
             _js=get_dark_js(),
             api_name="dark" if allow_api else None,
+            queue=False,
         )
         # Control chat and non-chat blocks, which can be independently used by chat checkbox swap
         chat.select(col_nochat_fun, chat, col_nochat, api_name="chat_checkbox" if allow_api else None) \
             .then(col_chat_fun, chat, col_chat) \
+            .then(context_fun, chat, context) \
+            .then(col_chat_fun, chat, exception_text)
         # examples after submit or any other buttons for chat or no chat
         if kwargs['examples'] is not None and kwargs['show_examples']:
             if sanitize_user_prompt:
                 from better_profanity import profanity
                 user_message1 = profanity.censor(user_message1)
+            if user_message1 in ['']:
+                # e.g. when user just hits enter in textbox,
+                # else will have <human>: <bot>: on single line, which seems to be "ok" for LLM but not usual
+                user_message1 = '\n'
             history = args_list[-1]
             if undo and history:
             :param retry:
             :return:
             """
+            args_list = copy.deepcopy(list(args))
             history = args_list[-1]  # model_state is -2
             if retry and history:
                 history.pop()
             if not history:
                 print("No history", flush=True)
+                history = [['', None]]
+                yield history, ''
                 return
             # ensure output will be unique to models
+            history = copy.deepcopy(history)
             instruction1 = history[-1][0]
             context1 = ''
             if kwargs['chat_history'] > 0:
             args_list[2] = context1[-kwargs['chat_history']:]
             model_state1 = args_list[-2]
             if model_state1[0] is None or model_state1[0] == no_model_str:
+                history = [['', None]]
+                yield history, ''
                 return
             args_list = args_list[:-2]
             fun1 = partial(evaluate,
                 for output in fun1(*tuple(args_list)):
                     bot_message = output
                     history[-1][1] = bot_message
+                    yield history, ''
             except StopIteration:
+                yield history, ''
             except RuntimeError as e:
                 if "generator raised StopIteration" in str(e):
                     # assume last entry was bad, undo
                     history.pop()
+                    yield history, ''
+                else:
+                    if history and len(history) > 0 and len(history[0]) > 1 and history[-1][1] is None:
+                        history[-1][1] = ''
+                    yield history, str(e)
+                    raise
             except Exception as e:
                 # put error into user input
+                ex = "Exception: %s" % str(e)
+                if history and len(history) > 0 and len(history[0]) > 1 and history[-1][1] is None:
+                    history[-1][1] = ''
+                yield history, ex
                 raise
             return
                          )
         bot_args = dict(fn=bot,
                         inputs=inputs_list + [model_state] + [text_output],
+                        outputs=[text_output, exception_text],
                         )
         retry_bot_args = dict(fn=functools.partial(bot, retry=True),
                               inputs=inputs_list + [model_state] + [text_output],
+                              outputs=[text_output, exception_text],
                               )
         undo_user_args = dict(fn=functools.partial(user, undo=True),
                               inputs=inputs_list + [text_output],
                           )
         bot_args2 = dict(fn=bot,
                          inputs=inputs_list + [model_state2] + [text_output2],
+                         outputs=[text_output2, exception_text],
                          )
         retry_bot_args2 = dict(fn=functools.partial(bot, retry=True),
                                inputs=inputs_list + [model_state2] + [text_output2],
+                               outputs=[text_output2, exception_text],
                                )
         undo_user_args2 = dict(fn=functools.partial(user, undo=True),
                                inputs=inputs_list + [text_output2],
             return gr.Textbox.update(value='')
         if kwargs['auto_score']:
+            score_args_submit = score_args
+            score_args2_submit = score_args2
         else:
+            score_args_submit = dict(fn=lambda: None, inputs=None, outputs=None)
+            score_args2_submit = dict(fn=lambda: None, inputs=None, outputs=None)
+        # in case of a 2nd model, consume the instruction first, so it can be cleared quickly
+        # bot doesn't consume the instruction itself, only the history from user, which is why this works
+        submit_event1a = instruction.submit(**user_args, queue=queue,
+                                            api_name='instruction' if allow_api else None)
+        submit_event1b = submit_event1a.then(**user_args2, api_name='instruction2' if allow_api else None)
+        submit_event1c = submit_event1b.then(clear_instruct, None, instruction) \
+            .then(clear_instruct, None, iinput)
+        submit_event1d = submit_event1c.then(**bot_args, api_name='instruction_bot' if allow_api else None,
+                                             queue=queue)
+        submit_event1e = submit_event1d.then(**score_args_submit, api_name='instruction_bot_score' if allow_api else None,
+                                             queue=queue)
+        submit_event1f = submit_event1e.then(**bot_args2, api_name='instruction_bot2' if allow_api else None,
+                                             queue=queue)
+        submit_event1g = submit_event1f.then(**score_args2_submit,
+                                             api_name='instruction_bot_score2' if allow_api else None, queue=queue)
+        submit_event1h = submit_event1g.then(clear_torch_cache)
+        submit_event2a = submit.click(**user_args, api_name='submit' if allow_api else None)
+        submit_event2b = submit_event2a.then(**user_args2, api_name='submit2' if allow_api else None)
+        submit_event2c = submit_event2b.then(clear_instruct, None, instruction) \
+            .then(clear_instruct, None, iinput)
+        submit_event2d = submit_event2c.then(**bot_args, api_name='submit_bot' if allow_api else None, queue=queue)
+        submit_event2e = submit_event2d.then(**score_args_submit, api_name='submit_bot_score' if allow_api else None,
+                                             queue=queue)
+        submit_event2f = submit_event2e.then(**bot_args2, api_name='submit_bot2' if allow_api else None, queue=queue)
+        submit_event2g = submit_event2f.then(**score_args2_submit, api_name='submit_bot_score2' if allow_api else None,
+                                             queue=queue)
+        submit_event2h = submit_event2g.then(clear_torch_cache)
+        submit_event3a = retry.click(**user_args, api_name='retry' if allow_api else None)
+        submit_event3b = submit_event3a.then(**user_args2, api_name='retry2' if allow_api else None)
+        submit_event3c = submit_event3b.then(clear_instruct, None, instruction) \
+            .then(clear_instruct, None, iinput)
+        submit_event3d = submit_event3c.then(**retry_bot_args, api_name='retry_bot' if allow_api else None,
+                                             queue=queue)
+        submit_event3e = submit_event3d.then(**score_args_submit, api_name='retry_bot_score' if allow_api else None,
+                                             queue=queue)
+        submit_event3f = submit_event3e.then(**retry_bot_args2, api_name='retry_bot2' if allow_api else None,
+                                             queue=queue)
+        submit_event3g = submit_event3f.then(**score_args2_submit, api_name='retry_bot_score2' if allow_api else None,
+                                             queue=queue)
+        submit_event3h = submit_event3g.then(clear_torch_cache)
+        submit_event4 = undo.click(**undo_user_args, api_name='undo' if allow_api else None) \
+            .then(**undo_user_args2, api_name='undo2' if allow_api else None) \
+            .then(clear_instruct, None, instruction) \
+            .then(clear_instruct, None, iinput) \
+            .then(**score_args_submit, api_name='undo_score' if allow_api else None) \
+            .then(**score_args2_submit, api_name='undo_score2' if allow_api else None)
         # does both models
         clear.click(lambda: None, None, text_output, queue=False, api_name='clear' if allow_api else None) \
                                         api_name='system_info' if allow_api else None, queue=False)
         # don't pass text_output, don't want to clear output, just stop it
+        # cancel only stops outer generation, not inner generation or non-generation
         stop_btn.click(lambda: None, None, None,
+                       cancels=[submit_event1d, submit_event1f,
+                                submit_event2d, submit_event2f,
+                                submit_event3d, submit_event3f,
+                                submit_event_nochat],
                        queue=False, api_name='stop' if allow_api else None).then(clear_torch_cache, queue=False)
         demo.load(None, None, None, _js=get_dark_js() if kwargs['h2ocolors'] else None)
 input_args_list = ['model_state']
+inputs_kwargs_list = ['debug', 'save_dir', 'sanitize_bot_response', 'model_state0', 'is_low_mem',
+                      'raise_generate_gpu_exceptions', 'chat_context', 'concurrency_count', 'lora_weights']
 def get_inputs_list(inputs_dict, model_lower):

utils.py CHANGED Viewed

@@ -259,3 +259,37 @@ def wrapped_partial(func, *args, **kwargs):
     partial_func = functools.partial(func, *args, **kwargs)
     functools.update_wrapper(partial_func, func)
     return partial_func

     partial_func = functools.partial(func, *args, **kwargs)
     functools.update_wrapper(partial_func, func)
     return partial_func
class ThreadException(Exception):
    """Custom exception type for errors associated with worker threads."""


class EThread(threading.Thread):
    """Thread that captures an exception raised by its target.

    Any exception raised while the thread runs is stored on ``self.exc``,
    optionally pushed into ``bucket`` as a ``sys.exc_info()`` tuple, and
    re-raised in the caller's thread when ``join()`` is called.  If a
    ``streamer`` object was passed via ``kwargs``, its ``do_stop`` attribute
    is set to True on failure.

    :param group: forwarded to ``threading.Thread``
    :param target: callable executed by the thread
    :param name: thread name, forwarded to ``threading.Thread``
    :param args: positional arguments for ``target``
    :param kwargs: keyword arguments for ``target``; an optional ``'streamer'``
        entry is remembered so it can be told to stop on failure
    :param daemon: forwarded to ``threading.Thread``
    :param bucket: optional object with a ``put()`` method (e.g. ``queue.Queue``)
        that receives the ``sys.exc_info()`` tuple of a failure
    """

    def __init__(self, group=None, target=None, name=None,
                 args=(), kwargs=None, *, daemon=None, bucket=None):
        self.bucket = bucket
        # kwargs defaults to None (same as threading.Thread), so only look up
        # the streamer when a mapping was actually supplied
        self.streamer = kwargs.get('streamer') if kwargs else None
        self.exc = None
        super().__init__(group=group, target=target, name=name, args=args, kwargs=kwargs, daemon=daemon)

    def run(self):
        """Run the target, recording (instead of propagating) any exception."""
        try:
            super().run()
        except BaseException as e:
            exc_info = sys.exc_info()
            # record first, so join() can re-raise even if reporting below fails
            self.exc = e
            print("thread exception: %s" % str(exc_info))
            if self.bucket is not None:
                self.bucket.put(exc_info)
            if self.streamer:
                print("make stop: %s" % str(exc_info), flush=True)
                self.streamer.do_stop = True

    def join(self, timeout=None):
        """Wait for the thread, then re-raise any exception it recorded.

        :param timeout: seconds to wait, or None to wait until the thread ends
        :raises BaseException: whatever exception the target raised, if any
        """
        super().join(timeout)
        # join() returns in the caller's thread, so surface the failure there
        if self.exc is not None:
            raise self.exc