tags, keeping original quotes
+    replaced_text = re.sub(pattern, lambda m: f'<q>{m.group(1)}{m.group(2)}{m.group(3)}</q>', text, flags=re.DOTALL)
+
+    return replaced_text
+
+
 def replace_blockquote(m):
     return m.group().replace('\n', '\n> ').replace('\\begin{blockquote}', '').replace('\\end{blockquote}', '')
 
 
-@functools.lru_cache(maxsize=4096)
+@functools.lru_cache(maxsize=None)
 def convert_to_markdown(string):
+    # Make \[ \] LaTeX equations inline
+    pattern = r'^\s*\\\[\s*\n([\s\S]*?)\n\s*\\\]\s*$'
+    replacement = r'\\[ \1 \\]'
+    string = re.sub(pattern, replacement, string, flags=re.MULTILINE)
+
+    # Escape backslashes
+    string = string.replace('\\', '\\\\')
+
+    # Quote to <q></q>
+    string = replace_quotes(string)
+
     # Blockquote
     string = re.sub(r'(^|[\n])&gt;', r'\1>', string)
     pattern = re.compile(r'\\begin{blockquote}(.*?)\\end{blockquote}', re.DOTALL)
@@ -69,12 +103,27 @@ def convert_to_markdown(string):
     result = ''
     is_code = False
+    is_latex = False
     for line in string.split('\n'):
-        if line.lstrip(' ').startswith('```'):
+        stripped_line = line.strip()
+
+        if stripped_line.startswith('```'):
             is_code = not is_code
+        elif stripped_line.startswith('$$'):
+            is_latex = not is_latex
+        elif stripped_line.endswith('$$'):
+            is_latex = False
+        elif stripped_line.startswith('\\\\['):
+            is_latex = True
+        elif stripped_line.startswith('\\\\]'):
+            is_latex = False
+        elif stripped_line.endswith('\\\\]'):
+            is_latex = False
 
         result += line
-        if is_code or line.startswith('|'):  # Don't add an extra \n for tables or code
+
+        # Don't add an extra \n for tables, code, or LaTeX
+        if is_code or is_latex or line.startswith('|'):
             result += '\n'
         else:
             result += '\n\n'
 
@@ -85,15 +134,20 @@ def convert_to_markdown(string):
 
     # Unfinished list, like "\n1.". A |delete| string is added and then
     # removed to force a <ol> or <ul> to be generated instead of a <p>.
-    if re.search(r'(\n\d+\.?|\n\*\s*)$', result):
+    list_item_pattern = r'(\n\d+\.?|\n\s*[-*+]\s*([*_~]{1,3})?)$'
+    if re.search(list_item_pattern, result):
         delete_str = '|delete|'
         if re.search(r'(\d+\.?)$', result) and not result.endswith('.'):
             result += '.'
 
-        result = re.sub(r'(\n\d+\.?|\n\*\s*)$', r'\g<1> ' + delete_str, result)
+        # Add the delete string after the list item
+        result = re.sub(list_item_pattern, r'\g<1> ' + delete_str, result)
 
+    # Convert to HTML using markdown
     html_output = markdown.markdown(result, extensions=['fenced_code', 'tables'])
+
+    # Remove the delete string from the HTML output
     pos = html_output.rfind(delete_str)
     if pos > -1:
         html_output = html_output[:pos] + html_output[pos + len(delete_str):]
 
@@ -119,6 +173,7 @@ def convert_to_markdown_wrapped(string, use_cache=True):
 
 
 def generate_basic_html(string):
+    convert_to_markdown.cache_clear()
     string = convert_to_markdown(string)
     string = f'
{string}'
     return string
diff --git a/modules/llama_cpp_python_hijack.py b/modules/llama_cpp_python_hijack.py
old mode 100755
new mode 100644
index c6b09a7344d7cda263b2407c8e17338f7f7f8f19..64280dc9a01e75cece4b778ebe636d7382b956fb
--- a/modules/llama_cpp_python_hijack.py
+++ b/modules/llama_cpp_python_hijack.py
@@ -1,3 +1,5 @@
+import importlib
+import platform
 from typing import Sequence
 
 from tqdm import tqdm
@@ -5,20 +7,46 @@ from tqdm import tqdm
 from modules import shared
 from modules.cache_utils import process_llamacpp_cache
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
 
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
+imported_module = None
 
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = None
+
+def llama_cpp_lib():
+    global imported_module
+
+    # Determine the platform
+    is_macos = platform.system() == 'Darwin'
+
+    # Define the library names based on the platform
+    if is_macos:
+        lib_names = [
+            (None, 'llama_cpp')
+        ]
+    else:
+        lib_names = [
+            ('cpu', 'llama_cpp'),
+            ('tensorcores', 'llama_cpp_cuda_tensorcores'),
+            (None, 'llama_cpp_cuda'),
+            (None, 'llama_cpp')
+        ]
+
+    for arg, lib_name in lib_names:
+        should_import = (arg is None or getattr(shared.args, arg))
+
+        if should_import:
+            if imported_module and imported_module != lib_name:
+                # Conflict detected, raise an exception
+                raise Exception(f"Cannot import `{lib_name}` because `{imported_module}` is already imported. Switching to a different version of llama-cpp-python currently requires a server restart.")
+
+            try:
+                return_lib = importlib.import_module(lib_name)
+                imported_module = lib_name
+                monkey_patch_llama_cpp_python(return_lib)
+                return return_lib
+            except ImportError:
+                continue
+
+    return None
 
 
 def eval_with_progress(self, tokens: Sequence[int]):
@@ -63,10 +91,12 @@ def eval_with_progress(self, tokens: Sequence[int]):
         self.n_tokens += n_tokens
 
 
-def monkey_patch_generate(lib):
+def monkey_patch_llama_cpp_python(lib):
+    if getattr(lib.Llama, '_is_patched', False):
+        # If the patch is already applied, do nothing
+        return
 
     def my_generate(self, *args, **kwargs):
-
         if shared.args.streaming_llm:
             new_sequence = args[0]
             past_sequence = self._input_ids
@@ -77,11 +107,9 @@ def monkey_patch_generate(lib):
         for output in self.original_generate(*args, **kwargs):
             yield output
 
+    lib.Llama.eval = eval_with_progress
     lib.Llama.original_generate = lib.Llama.generate
     lib.Llama.generate = my_generate
-
-for lib in [llama_cpp, llama_cpp_cuda, llama_cpp_cuda_tensorcores]:
-    if lib is not None:
-        lib.Llama.eval = eval_with_progress
-        monkey_patch_generate(lib)
\ No newline at end of file
+    # Set the flag to indicate that the patch has been applied
+    lib.Llama._is_patched = True
diff --git a/modules/llamacpp_hf.py b/modules/llamacpp_hf.py
old mode 100755
new mode 100644
index 1bfd667dbcfb91977f4a62a4d6f895ce08861c47..327e3a7b438e954e3052a4b00b3680746a3a0238
--- a/modules/llamacpp_hf.py
+++ b/modules/llamacpp_hf.py
@@ -7,35 +7,10 @@ from torch.nn import CrossEntropyLoss
 from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
-from modules import RoPE, llama_cpp_python_hijack, shared
+from modules import shared
+from modules.llama_cpp_python_hijack import llama_cpp_lib
 from modules.logging_colors import logger
 
-try:
-    import llama_cpp
-except:
-    llama_cpp = None
-
-try:
-    import llama_cpp_cuda
-except:
-    llama_cpp_cuda = None
-
-try:
-    import llama_cpp_cuda_tensorcores
-except:
-    llama_cpp_cuda_tensorcores = 
None - - -def llama_cpp_lib(): - if shared.args.cpu and llama_cpp is not None: - return llama_cpp - elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None: - return llama_cpp_cuda_tensorcores - elif llama_cpp_cuda is not None: - return llama_cpp_cuda - else: - return llama_cpp - class LlamacppHF(PreTrainedModel): def __init__(self, model, path): @@ -212,14 +187,22 @@ class LlamacppHF(PreTrainedModel): 'mul_mat_q': not shared.args.no_mul_mat_q, 'numa': shared.args.numa, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), + 'rope_freq_base': shared.args.rope_freq_base, 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'logits_all': shared.args.logits_all, 'offload_kqv': not shared.args.no_offload_kqv, - 'split_mode': 1 if not shared.args.row_split else 2 + 'split_mode': 1 if not shared.args.row_split else 2, + 'flash_attn': shared.args.flash_attn } + if shared.args.cache_4bit: + params["type_k"] = 2 + params["type_v"] = 2 + elif shared.args.cache_8bit: + params["type_k"] = 8 + params["type_v"] = 8 + Llama = llama_cpp_lib().Llama model = Llama(**params) diff --git a/modules/llamacpp_model.py b/modules/llamacpp_model.py old mode 100755 new mode 100644 index 8bc9b7cb0be237fe0f47f31350f1e6a74ef208dc..a16230caf3d57c8c0280216588333ebe079d0508 --- a/modules/llamacpp_model.py +++ b/modules/llamacpp_model.py @@ -4,37 +4,12 @@ from functools import partial import numpy as np import torch -from modules import RoPE, llama_cpp_python_hijack, shared +from modules import shared from modules.callbacks import Iteratorize +from modules.llama_cpp_python_hijack import llama_cpp_lib from modules.logging_colors import logger from modules.text_generation import get_max_prompt_length -try: - import llama_cpp -except: - llama_cpp = None - -try: - import llama_cpp_cuda -except: - llama_cpp_cuda = None - -try: - import llama_cpp_cuda_tensorcores -except: - llama_cpp_cuda_tensorcores = None - - -def llama_cpp_lib(): - if shared.args.cpu and llama_cpp is not None: - return llama_cpp - elif shared.args.tensorcores and llama_cpp_cuda_tensorcores is not None: - return llama_cpp_cuda_tensorcores - elif llama_cpp_cuda is not None: - return llama_cpp_cuda - else: - return llama_cpp - def ban_eos_logits_processor(eos_token, input_ids, logits): logits[eos_token] = -float('inf') @@ -92,13 +67,21 @@ class LlamaCppModel: 'mul_mat_q': not shared.args.no_mul_mat_q, 'numa': shared.args.numa, 'n_gpu_layers': shared.args.n_gpu_layers, - 'rope_freq_base': RoPE.get_rope_freq_base(shared.args.alpha_value, shared.args.rope_freq_base), + 'rope_freq_base': shared.args.rope_freq_base, 'tensor_split': tensor_split_list, 'rope_freq_scale': 1.0 / shared.args.compress_pos_emb, 'offload_kqv': not shared.args.no_offload_kqv, - 'split_mode': 1 if not shared.args.row_split else 2 + 'split_mode': 1 if not shared.args.row_split else 2, + 'flash_attn': shared.args.flash_attn } + if shared.args.cache_4bit: + params["type_k"] = 2 + params["type_v"] = 2 + elif shared.args.cache_8bit: + params["type_k"] = 8 + params["type_v"] = 8 + result.model = Llama(**params) if cache_capacity > 0: result.model.set_cache(LlamaCache(capacity_bytes=cache_capacity)) diff --git a/modules/loaders.py b/modules/loaders.py old mode 100755 new mode 100644 index 234773397df378f9fd7cff4ed400263f4c127083..549de5fb020c93a32f5c52e6677adbc313cd4974 --- a/modules/loaders.py +++ b/modules/loaders.py @@ -21,8 +21,8 @@ loaders_and_params = 
OrderedDict({ 'trust_remote_code', 'no_use_fast', 'use_flash_attention_2', + 'use_eager_attention', 'alpha_value', - 'rope_freq_base', 'compress_pos_emb', 'disable_exllama', 'disable_exllamav2', @@ -31,6 +31,8 @@ loaders_and_params = OrderedDict({ 'llama.cpp': [ 'n_ctx', 'n_gpu_layers', + 'cache_8bit', + 'cache_4bit', 'tensor_split', 'n_batch', 'threads', @@ -38,7 +40,6 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'mlock', 'no_mul_mat_q', - 'alpha_value', 'rope_freq_base', 'compress_pos_emb', 'cpu', @@ -46,12 +47,15 @@ loaders_and_params = OrderedDict({ 'no_offload_kqv', 'row_split', 'tensorcores', + 'flash_attn', 'streaming_llm', 'attention_sink_size', ], 'llamacpp_HF': [ 'n_ctx', 'n_gpu_layers', + 'cache_8bit', + 'cache_4bit', 'tensor_split', 'n_batch', 'threads', @@ -59,7 +63,6 @@ loaders_and_params = OrderedDict({ 'no_mmap', 'mlock', 'no_mul_mat_q', - 'alpha_value', 'rope_freq_base', 'compress_pos_emb', 'cpu', @@ -71,6 +74,7 @@ loaders_and_params = OrderedDict({ 'no_offload_kqv', 'row_split', 'tensorcores', + 'flash_attn', 'streaming_llm', 'attention_sink_size', 'llamacpp_HF_info', @@ -80,6 +84,8 @@ loaders_and_params = OrderedDict({ 'max_seq_len', 'cfg_cache', 'no_flash_attn', + 'no_xformers', + 'no_sdpa', 'num_experts_per_token', 'cache_8bit', 'cache_4bit', @@ -93,6 +99,8 @@ loaders_and_params = OrderedDict({ 'gpu_split', 'max_seq_len', 'no_flash_attn', + 'no_xformers', + 'no_sdpa', 'num_experts_per_token', 'cache_8bit', 'cache_4bit', @@ -103,7 +111,6 @@ loaders_and_params = OrderedDict({ ], 'AutoGPTQ': [ 'triton', - 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', 'wbits', @@ -120,34 +127,15 @@ loaders_and_params = OrderedDict({ 'no_use_fast', 'autogptq_info', ], - 'AutoAWQ': [ - 'cpu_memory', - 'gpu_memory', - 'auto_devices', - 'max_seq_len', - 'no_inject_fused_attention', - 'trust_remote_code', - 'no_use_fast', - ], - 'GPTQ-for-LLaMa': [ - 'wbits', - 'groupsize', - 'model_type', - 'pre_layer', - 'trust_remote_code', - 'no_use_fast', - 'gptq_for_llama_info', - ], - 'QuIP#': [ - 'trust_remote_code', - 'no_use_fast', - 'no_flash_attn', - 'quipsharp_info', - ], 'HQQ': [ 'hqq_backend', 'trust_remote_code', 'no_use_fast', + ], + 'TensorRT-LLM': [ + 'max_seq_len', + 'cpp_runner', + 'tensorrt_llm_info', ] }) @@ -176,6 +164,10 @@ def transformers_samplers(): 'repetition_penalty_range', 'encoder_repetition_penalty', 'no_repeat_ngram_size', + 'dry_multiplier', + 'dry_base', + 'dry_allowed_length', + 'dry_sequence_breakers', 'seed', 'do_sample', 'penalty_alpha', @@ -199,9 +191,6 @@ def transformers_samplers(): loaders_samplers = { 'Transformers': transformers_samplers(), 'AutoGPTQ': transformers_samplers(), - 'GPTQ-for-LLaMa': transformers_samplers(), - 'AutoAWQ': transformers_samplers(), - 'QuIP#': transformers_samplers(), 'HQQ': transformers_samplers(), 'ExLlamav2': { 'temperature', @@ -249,6 +238,10 @@ loaders_samplers = { 'repetition_penalty_range', 'encoder_repetition_penalty', 'no_repeat_ngram_size', + 'dry_multiplier', + 'dry_base', + 'dry_allowed_length', + 'dry_sequence_breakers', 'seed', 'do_sample', 'mirostat_mode', @@ -307,6 +300,10 @@ loaders_samplers = { 'repetition_penalty_range', 'encoder_repetition_penalty', 'no_repeat_ngram_size', + 'dry_multiplier', + 'dry_base', + 'dry_allowed_length', + 'dry_sequence_breakers', 'seed', 'do_sample', 'mirostat_mode', @@ -323,15 +320,16 @@ loaders_samplers = { 'skip_special_tokens', 'auto_max_new_tokens', }, -} - -loaders_model_types = { - 'GPTQ-for-LLaMa': [ - "None", - "llama", - "opt", - "gptj" - ], + 
'TensorRT-LLM': { + 'temperature', + 'top_p', + 'top_k', + 'repetition_penalty', + 'presence_penalty', + 'frequency_penalty', + 'ban_eos_token', + 'auto_max_new_tokens', + } } @@ -361,13 +359,6 @@ def blacklist_samplers(loader, dynamic_temperature): return output -def get_model_types(loader): - if loader in loaders_model_types: - return loaders_model_types[loader] - - return ["None"] - - def get_gpu_memory_keys(): return [k for k in shared.gradio if k.startswith('gpu_memory')] diff --git a/modules/logging_colors.py b/modules/logging_colors.py old mode 100755 new mode 100644 diff --git a/modules/logits.py b/modules/logits.py old mode 100755 new mode 100644 index f2fd233b34674b40a4c91489fb340d511b062915..73cabb41b8605708ef9acf5e91c0b8cfdcc654a9 --- a/modules/logits.py +++ b/modules/logits.py @@ -1,14 +1,39 @@ +import time +import traceback + import torch from transformers import is_torch_npu_available, is_torch_xpu_available -from modules import sampler_hijack, shared +from modules import models, sampler_hijack, shared from modules.logging_colors import logger +from modules.models import load_model from modules.text_generation import generate_reply global_scores = None -def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False): +def get_next_logits(*args, **kwargs): + if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: + shared.model, shared.tokenizer = load_model(shared.model_name) + + needs_lock = not args[2] # use_samplers + if needs_lock: + shared.generation_lock.acquire() + + try: + result = _get_next_logits(*args, **kwargs) + except Exception: + traceback.print_exc() + result = None + + if needs_lock: + models.last_generation_time = time.time() + shared.generation_lock.release() + + return result + + +def _get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return_dict=False): if shared.model is None: logger.error("No model is loaded! 
Select one in the Model tab.") return 'Error: No model is loaded1 Select one in the Model tab.', previous @@ -66,7 +91,14 @@ def get_next_logits(prompt, state, use_samplers, previous, top_logits=25, return topk_values = [float(i) for i in topk_values] output = {} for row in list(zip(topk_values, tokens)): - output[row[1]] = row[0] + key = row[1] + if isinstance(key, bytes): + try: + key = key.decode() + except: + key = key.decode('latin') + + output[key] = row[0] return output else: diff --git a/modules/metadata_gguf.py b/modules/metadata_gguf.py old mode 100755 new mode 100644 diff --git a/modules/models.py b/modules/models.py old mode 100755 new mode 100644 index c89bcae4dd2ddaf51fd702544327095f241d7997..b0e2346e18911016a253f1ea2d78b23fb81bf104 --- a/modules/models.py +++ b/modules/models.py @@ -1,5 +1,4 @@ import gc -import logging import os import pprint import re @@ -26,10 +25,9 @@ from transformers import ( ) import modules.shared as shared -from modules import RoPE, sampler_hijack +from modules import sampler_hijack from modules.logging_colors import logger from modules.models_settings import get_model_metadata -from modules.relative_imports import RelativeImport transformers.logging.set_verbosity_error() @@ -61,6 +59,9 @@ if shared.args.deepspeed: sampler_hijack.hijack_samplers() +last_generation_time = time.time() + + def load_model(model_name, loader=None): logger.info(f"Loading \"{model_name}\"") t0 = time.time() @@ -70,14 +71,12 @@ def load_model(model_name, loader=None): load_func_map = { 'Transformers': huggingface_loader, 'AutoGPTQ': AutoGPTQ_loader, - 'GPTQ-for-LLaMa': GPTQ_loader, 'llama.cpp': llamacpp_loader, 'llamacpp_HF': llamacpp_HF_loader, 'ExLlamav2': ExLlamav2_loader, 'ExLlamav2_HF': ExLlamav2_HF_loader, - 'AutoAWQ': AutoAWQ_loader, - 'QuIP#': QuipSharp_loader, 'HQQ': HQQ_loader, + 'TensorRT-LLM': TensorRT_LLM_loader, } metadata = get_model_metadata(model_name) @@ -99,24 +98,28 @@ def load_model(model_name, loader=None): if model is None: return None, None else: - tokenizer = load_tokenizer(model_name, model) + tokenizer = load_tokenizer(model_name) shared.settings.update({k: v for k, v in metadata.items() if k in shared.settings}) - if loader.lower().startswith('exllama'): + if loader.lower().startswith('exllama') or loader.lower().startswith('tensorrt'): shared.settings['truncation_length'] = shared.args.max_seq_len elif loader in ['llama.cpp', 'llamacpp_HF']: shared.settings['truncation_length'] = shared.args.n_ctx + logger.info(f"Loaded \"{model_name}\" in {(time.time()-t0):.2f} seconds.") logger.info(f"LOADER: \"{loader}\"") logger.info(f"TRUNCATION LENGTH: {shared.settings['truncation_length']}") logger.info(f"INSTRUCTION TEMPLATE: \"{metadata['instruction_template']}\"") - logger.info(f"Loaded the model in {(time.time()-t0):.2f} seconds.") return model, tokenizer -def load_tokenizer(model_name, model): +def load_tokenizer(model_name, tokenizer_dir=None): + if tokenizer_dir: + path_to_model = Path(tokenizer_dir) + else: + path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") + tokenizer = None - path_to_model = Path(f"{shared.args.model_dir}/{model_name}/") if path_to_model.exists(): if shared.args.no_use_fast: logger.info('Loading the tokenizer with use_fast=False.') @@ -146,6 +149,9 @@ def huggingface_loader(model_name): if shared.args.force_safetensors: params['force_safetensors'] = True + if shared.args.use_eager_attention: + params['attn_implementation'] = 'eager' + config = AutoConfig.from_pretrained(path_to_model, 
trust_remote_code=shared.args.trust_remote_code) if 'chatglm' in model_name.lower(): @@ -249,7 +255,7 @@ def huggingface_loader(model_name): if shared.args.compress_pos_emb > 1: params['rope_scaling'] = {'type': 'linear', 'factor': shared.args.compress_pos_emb} elif shared.args.alpha_value > 1: - params['rope_scaling'] = {'type': 'dynamic', 'factor': RoPE.get_alpha_value(shared.args.alpha_value, shared.args.rope_freq_base)} + params['rope_scaling'] = {'type': 'dynamic', 'factor': shared.args.alpha_value} logger.info("TRANSFORMERS_PARAMS=") pprint.PrettyPrinter(indent=4, sort_dicts=False).pprint(params) @@ -266,7 +272,7 @@ def llamacpp_loader(model_name): if path.is_file(): model_file = path else: - model_file = list(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] + model_file = sorted(Path(f'{shared.args.model_dir}/{model_name}').glob('*.gguf'))[0] logger.info(f"llama.cpp weights detected: \"{model_file}\"") model, tokenizer = LlamaCppModel.from_pretrained(model_file) @@ -276,84 +282,24 @@ def llamacpp_loader(model_name): def llamacpp_HF_loader(model_name): from modules.llamacpp_hf import LlamacppHF - path = Path(f'{shared.args.model_dir}/{model_name}') - - # Check if a HF tokenizer is available for the model - if all((path / file).exists() for file in ['tokenizer_config.json']): - logger.info(f'Using tokenizer from: \"{path}\"') + if shared.args.tokenizer_dir: + logger.info(f'Using tokenizer from: \"{shared.args.tokenizer_dir}\"') else: - logger.error("Could not load the model because a tokenizer in Transformers format was not found.") - return None, None + path = Path(f'{shared.args.model_dir}/{model_name}') + # Check if a HF tokenizer is available for the model + if all((path / file).exists() for file in ['tokenizer_config.json']): + logger.info(f'Using tokenizer from: \"{path}\"') + else: + logger.error("Could not load the model because a tokenizer in Transformers format was not found.") + return None, None model = LlamacppHF.from_pretrained(model_name) - return model - - -def AutoAWQ_loader(model_name): - from awq import AutoAWQForCausalLM - - model_dir = Path(f'{shared.args.model_dir}/{model_name}') - - model = AutoAWQForCausalLM.from_quantized( - quant_path=model_dir, - max_new_tokens=shared.args.max_seq_len, - trust_remote_code=shared.args.trust_remote_code, - fuse_layers=not shared.args.no_inject_fused_attention, - max_memory=get_max_memory_dict(), - batch_size=1, - safetensors=any(model_dir.glob('*.safetensors')), - ) - - return model - - -def QuipSharp_loader(model_name): - try: - with RelativeImport("repositories/quip-sharp"): - from lib.utils.unsafe_import import model_from_hf_path - except: - logger.error( - "\nQuIP# has not been found. It must be installed manually for now.\n" - "For instructions on how to do that, please consult:\n" - "https://github.com/oobabooga/text-generation-webui/pull/4803\n" - ) - return None, None - - # This fixes duplicate logging messages after the import above. - handlers = logging.getLogger().handlers - if len(handlers) > 1: - logging.getLogger().removeHandler(handlers[1]) - model_dir = Path(f'{shared.args.model_dir}/{model_name}') - if not all((model_dir / file).exists() for file in ['tokenizer_config.json', 'special_tokens_map.json', 'tokenizer.model']): - logger.error(f"Could not load the model because the tokenizer files could not be found in the model folder. 
Please download the following files from the original (unquantized) model into {model_dir}: special_tokens_map.json, tokenizer.json, tokenizer.model, tokenizer_config.json.") - return None, None - - model, model_str = model_from_hf_path( - model_dir, - use_cuda_graph=False, - use_flash_attn=not shared.args.no_flash_attn - ) - - return model - - -def GPTQ_loader(model_name): - - # Monkey patch - if shared.args.monkey_patch: - logger.warning("Applying the monkey patch for using LoRAs with GPTQ models. It may cause undefined behavior outside its intended scope.") - from modules.monkey_patch_gptq_lora import load_model_llama - - model, _ = load_model_llama(model_name) - - # No monkey patch + if shared.args.tokenizer_dir: + tokenizer = load_tokenizer(model_name, tokenizer_dir=shared.args.tokenizer_dir) + return model, tokenizer else: - import modules.GPTQ_loader - - model = modules.GPTQ_loader.load_quantized(model_name) - - return model + return model def AutoGPTQ_loader(model_name): @@ -377,16 +323,23 @@ def ExLlamav2_HF_loader(model_name): def HQQ_loader(model_name): from hqq.core.quantize import HQQBackend, HQQLinear - from hqq.engine.hf import HQQModelForCausalLM + from hqq.models.hf.base import AutoHQQHFModel logger.info(f"Loading HQQ model with backend: \"{shared.args.hqq_backend}\"") model_dir = Path(f'{shared.args.model_dir}/{model_name}') - model = HQQModelForCausalLM.from_quantized(str(model_dir)) + model = AutoHQQHFModel.from_quantized(str(model_dir)) HQQLinear.set_backend(getattr(HQQBackend, shared.args.hqq_backend)) return model +def TensorRT_LLM_loader(model_name): + from modules.tensorrt_llm import TensorRTLLMModel + + model = TensorRTLLMModel.from_pretrained(model_name) + return model + + def get_max_memory_dict(): max_memory = {} max_cpu_memory = shared.args.cpu_memory.strip() if shared.args.cpu_memory is not None else '99GiB' @@ -426,14 +379,34 @@ def clear_torch_cache(): torch.cuda.empty_cache() -def unload_model(): +def unload_model(keep_model_name=False): shared.model = shared.tokenizer = None - shared.model_name = 'None' shared.lora_names = [] shared.model_dirty_from_training = False clear_torch_cache() + if not keep_model_name: + shared.model_name = 'None' + def reload_model(): unload_model() shared.model, shared.tokenizer = load_model(shared.model_name) + + +def unload_model_if_idle(): + global last_generation_time + + logger.info(f"Setting a timeout of {shared.args.idle_timeout} minutes to unload the model in case of inactivity.") + + while True: + shared.generation_lock.acquire() + try: + if time.time() - last_generation_time > shared.args.idle_timeout * 60: + if shared.model is not None: + logger.info("Unloading the model for inactivity.") + unload_model(keep_model_name=True) + finally: + shared.generation_lock.release() + + time.sleep(60) diff --git a/modules/models_settings.py b/modules/models_settings.py old mode 100755 new mode 100644 index a7fed427b9f11914eb474aaa4c90a71de4ffd6e0..1bb00ceb6a38499811fd457887e3bd0dc4f74ef9 --- a/modules/models_settings.py +++ b/modules/models_settings.py @@ -9,14 +9,16 @@ from modules import chat, loaders, metadata_gguf, shared, ui def get_fallback_settings(): return { + 'bf16': False, + 'use_eager_attention': False, 'wbits': 'None', 'groupsize': 'None', 'desc_act': False, - 'model_type': 'None', 'max_seq_len': 2048, 'n_ctx': 2048, 'rope_freq_base': 0, 'compress_pos_emb': 1, + 'alpha_value': 1, 'truncation_length': shared.settings['truncation_length'], 'skip_special_tokens': shared.settings['skip_special_tokens'], 
'custom_stopping_strings': shared.settings['custom_stopping_strings'], @@ -40,12 +42,7 @@ def get_model_metadata(model): hf_metadata = None if 'loader' not in model_settings: - if hf_metadata is not None and 'quip_params' in hf_metadata: - loader = 'QuIP#' - else: - loader = infer_loader(model, model_settings) - - model_settings['loader'] = loader + model_settings['loader'] = infer_loader(model, model_settings) # GGUF metadata if model_settings['loader'] in ['llama.cpp', 'llamacpp_HF']: @@ -56,6 +53,7 @@ def get_model_metadata(model): model_file = list(path.glob('*.gguf'))[0] metadata = metadata_gguf.load_metadata(model_file) + for k in metadata: if k.endswith('context_length'): model_settings['n_ctx'] = metadata[k] @@ -63,10 +61,19 @@ def get_model_metadata(model): model_settings['rope_freq_base'] = metadata[k] elif k.endswith('rope.scale_linear'): model_settings['compress_pos_emb'] = metadata[k] + elif k.endswith('rope.scaling.factor'): + model_settings['compress_pos_emb'] = metadata[k] + elif k.endswith('block_count'): + model_settings['n_gpu_layers'] = metadata[k] + 1 + if 'tokenizer.chat_template' in metadata: template = metadata['tokenizer.chat_template'] eos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.eos_token_id']] - bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']] + if 'tokenizer.ggml.bos_token_id' in metadata: + bos_token = metadata['tokenizer.ggml.tokens'][metadata['tokenizer.ggml.bos_token_id']] + else: + bos_token = "" + template = template.replace('eos_token', "'{}'".format(eos_token)) template = template.replace('bos_token', "'{}'".format(bos_token)) @@ -79,6 +86,9 @@ def get_model_metadata(model): # Transformers metadata if hf_metadata is not None: metadata = json.loads(open(path, 'r', encoding='utf-8').read()) + if 'pretrained_config' in metadata: + metadata = metadata['pretrained_config'] + for k in ['max_position_embeddings', 'model_max_length', 'max_seq_len']: if k in metadata: model_settings['truncation_length'] = metadata[k] @@ -89,10 +99,18 @@ def get_model_metadata(model): elif 'attn_config' in metadata and 'rope_theta' in metadata['attn_config']: model_settings['rope_freq_base'] = metadata['attn_config']['rope_theta'] - if 'rope_scaling' in metadata and type(metadata['rope_scaling']) is dict and all(key in metadata['rope_scaling'] for key in ('type', 'factor')): + if 'rope_scaling' in metadata and isinstance(metadata['rope_scaling'], dict) and all(key in metadata['rope_scaling'] for key in ('type', 'factor')): if metadata['rope_scaling']['type'] == 'linear': model_settings['compress_pos_emb'] = metadata['rope_scaling']['factor'] + # For Gemma-2 + if 'torch_dtype' in metadata and metadata['torch_dtype'] == 'bfloat16': + model_settings['bf16'] = True + + # For Gemma-2 + if 'architectures' in metadata and isinstance(metadata['architectures'], list) and 'Gemma2ForCausalLM' in metadata['architectures']: + model_settings['use_eager_attention'] = True + # Read GPTQ metadata for old GPTQ loaders if 'quantization_config' in metadata and metadata['quantization_config'].get('quant_method', '') != 'exl2': if 'bits' in metadata['quantization_config']: @@ -125,7 +143,7 @@ def get_model_metadata(model): for k in ['eos_token', 'bos_token']: if k in metadata: value = metadata[k] - if type(value) is dict: + if isinstance(value, dict): value = value['content'] template = template.replace(k, "'{}'".format(value)) @@ -160,10 +178,8 @@ def infer_loader(model_name, model_settings): path_to_model = 
Path(f'{shared.args.model_dir}/{model_name}') if not path_to_model.exists(): loader = None - elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and type(model_settings['wbits']) is int and model_settings['wbits'] > 0): + elif (path_to_model / 'quantize_config.json').exists() or ('wbits' in model_settings and isinstance(model_settings['wbits'], int) and model_settings['wbits'] > 0): loader = 'ExLlamav2_HF' - elif (path_to_model / 'quant_config.json').exists() or re.match(r'.*-awq', model_name.lower()): - loader = 'AutoAWQ' elif len(list(path_to_model.glob('*.gguf'))) > 0 and path_to_model.is_dir() and (path_to_model / 'tokenizer_config.json').exists(): loader = 'llamacpp_HF' elif len(list(path_to_model.glob('*.gguf'))) > 0: @@ -200,20 +216,17 @@ def update_model_parameters(state, initial=False): continue # Setting null defaults - if element in ['wbits', 'groupsize', 'model_type'] and value == 'None': + if element in ['wbits', 'groupsize'] and value == 'None': value = vars(shared.args_defaults)[element] elif element in ['cpu_memory'] and value == 0: value = vars(shared.args_defaults)[element] # Making some simple conversions - if element in ['wbits', 'groupsize', 'pre_layer']: + if element in ['wbits', 'groupsize']: value = int(value) elif element == 'cpu_memory' and value is not None: value = f"{value}MiB" - if element in ['pre_layer']: - value = [value] if value > 0 else None - setattr(shared.args, element, value) found_positive = False @@ -238,7 +251,7 @@ def apply_model_settings_to_state(model, state): loader = model_settings.pop('loader') # If the user is using an alternative loader for the same model type, let them keep using it - if not (loader == 'ExLlamav2_HF' and state['loader'] in ['GPTQ-for-LLaMa', 'ExLlamav2', 'AutoGPTQ']): + if not (loader == 'ExLlamav2_HF' and state['loader'] in ['ExLlamav2', 'AutoGPTQ']): state['loader'] = loader for k in model_settings: diff --git a/modules/monkey_patch_gptq_lora.py b/modules/monkey_patch_gptq_lora.py deleted file mode 100755 index 3166bd33ceba449cb542861b0238818f68c7b02e..0000000000000000000000000000000000000000 --- a/modules/monkey_patch_gptq_lora.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copied from https://github.com/johnsmith0031/alpaca_lora_4bit - -from pathlib import Path - -import alpaca_lora_4bit.autograd_4bit as autograd_4bit -from alpaca_lora_4bit.amp_wrapper import AMPWrapper -from alpaca_lora_4bit.autograd_4bit import ( - Autograd4bitQuantLinear, - load_llama_model_4bit_low_ram -) -from alpaca_lora_4bit.models import Linear4bitLt -from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import ( - replace_peft_model_with_int4_lora_model -) - -from modules import shared -from modules.GPTQ_loader import find_quantized_model_file - -replace_peft_model_with_int4_lora_model() - - -def load_model_llama(model_name): - config_path = str(Path(f'{shared.args.model_dir}/{model_name}')) - model_path = str(find_quantized_model_file(model_name)) - model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=shared.args.groupsize, is_v1_model=False) - for _, m in model.named_modules(): - if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt): - if m.is_v1_model: - m.zeros = m.zeros.half() - m.scales = m.scales.half() - m.bias = m.bias.half() - - autograd_4bit.auto_switch = True - - model.half() - wrapper = AMPWrapper(model) - wrapper.apply_generate() - - return model, tokenizer diff --git a/modules/one_click_installer_check.py b/modules/one_click_installer_check.py 
old mode 100755 new mode 100644 diff --git a/modules/presets.py b/modules/presets.py old mode 100755 new mode 100644 index a7cda31cd15603a679496214e2909a55dbc8dd79..b00e829eb1779fb58907aacc70b375652ae0ec97 --- a/modules/presets.py +++ b/modules/presets.py @@ -40,6 +40,10 @@ def default_preset(): 'do_sample': True, 'encoder_repetition_penalty': 1, 'no_repeat_ngram_size': 0, + 'dry_multiplier': 0, + 'dry_base': 1.75, + 'dry_allowed_length': 2, + 'dry_sequence_breakers': '"\\n", ":", "\\"", "*"', 'sampler_priority': 'temperature\ndynamic_temperature\nquadratic_sampling\ntop_k\ntop_p\ntypical_p\nepsilon_cutoff\neta_cutoff\ntfs\ntop_a\nmin_p\nmirostat' } diff --git a/modules/prompts.py b/modules/prompts.py old mode 100755 new mode 100644 diff --git a/modules/relative_imports.py b/modules/relative_imports.py old mode 100755 new mode 100644 diff --git a/modules/sampler_hijack.py b/modules/sampler_hijack.py old mode 100755 new mode 100644 index da52f4d0b41e1a3e639ab40b42c23401ce8f32ff..9fb661aecceb2bc2b69a3a2944bd79db9f96e774 --- a/modules/sampler_hijack.py +++ b/modules/sampler_hijack.py @@ -1,3 +1,4 @@ +import json import math import pprint @@ -124,36 +125,6 @@ class QuadraticSamplingLogitsWarper(LogitsWarper): return transformed_logits -class MinPLogitsWarper(LogitsWarper): - def __init__(self, min_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): - if min_p < 0 or min_p > 1.0: - raise ValueError(f"`min_p` has to be a float >= 0 and <= 1, but is {min_p}") - self.min_p = min_p - self.filter_value = filter_value - self.min_tokens_to_keep = min_tokens_to_keep - - def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: - # Convert logits to probabilities - probs = torch.softmax(scores, dim=-1) - # Get the probability of the top token for each sequence in the batch - top_probs, _ = probs.max(dim=-1, keepdim=True) - # Calculate the actual min_p threshold by scaling min_p with the top token's probability - scaled_min_p = self.min_p * top_probs - # Create a mask for tokens that have a probability less than the scaled min_p - tokens_to_remove = probs < scaled_min_p - - sorted_indices = torch.argsort(scores, descending=True, dim=-1) - sorted_indices_to_remove = torch.gather(tokens_to_remove, dim=-1, index=sorted_indices) - - if self.min_tokens_to_keep > 1: - # Keep at least min_tokens_to_keep - sorted_indices_to_remove[..., : self.min_tokens_to_keep] = False - - indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove) - scores = scores.masked_fill(indices_to_remove, self.filter_value) - return scores - - class TailFreeLogitsWarper(LogitsWarper): def __init__(self, tfs: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): tfs = float(tfs) @@ -220,6 +191,78 @@ class TopALogitsWarper(LogitsWarper): return scores +class DRYLogitsProcessor(LogitsProcessor): + def __init__(self, multiplier: float, base: float, allowed_length: int, sequence_breakers: set[int], _range: int): + self.multiplier = multiplier + self.base = base + self.allowed_length = allowed_length + self.sequence_breakers = sequence_breakers + self._range = _range + + def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + if self._range > 0: + input_ids = input_ids[:, -self._range:] + + for input_ids_row, scores_row in zip(input_ids, scores): + # Use normal Python data types for improved performance + input_ids = input_ids_row.tolist() + + last_token = input_ids[-1] + if 
last_token in self.sequence_breakers: + continue + + # Exclude the last token as it always matches. + match_indices = [] + for idx, val in enumerate(input_ids[:-1]): + if val == last_token: + match_indices.append(idx) + + # Stores the maximum matching sequence length + # for each token immediately following the sequence in the input. + match_lengths = {} + + for i in match_indices: + next_token = input_ids[i + 1] + + if next_token in self.sequence_breakers: + continue + + # We have already found that `last_token` matches at this index, + # so the match is at least of length 1. + match_length = 1 + + # Extend the match backwards (at most to 50 to prevent exponent overflow at penalty calculation) (this cap also improves performance on worst case) + while match_length < 50: + j = i - match_length + if j < 0: + # Start of input reached. + break + + previous_token = input_ids[-(match_length + 1)] + if input_ids[j] != previous_token: + # Start of match reached. + break + + if previous_token in self.sequence_breakers: + # Sequence-breaking token reached. + break + + match_length += 1 + + if next_token in match_lengths: + match_lengths[next_token] = max(match_length, match_lengths[next_token]) + else: + match_lengths[next_token] = match_length + + # Apply penalties. + for token, match_length in match_lengths.items(): + if match_length >= self.allowed_length: + penalty = self.multiplier * self.base ** (match_length - self.allowed_length) + scores_row[token] -= penalty + + return scores + + class MirostatLogitsWarper(LogitsWarper): def __init__(self, mirostat_mode: int, mirostat_tau: float, mirostat_eta: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1): if mirostat_mode not in [2]: @@ -316,14 +359,14 @@ class RepetitionPenaltyLogitsProcessorWithRange(LogitsProcessor): return scores -def get_logits_warper_patch(self, generation_config): +def get_logits_warper_patch(self, generation_config, **kwargs): # Parameter sanitization if isinstance(generation_config.temperature, int): generation_config.temperature = float(generation_config.temperature) # Must be float # Get the original warpers - warpers = self._get_logits_warper_old(generation_config) + warpers = self._get_logits_warper_old(generation_config, **kwargs) # Replace temperature with our modified class. # Currently, it behaves identically to the original. 
@@ -352,14 +395,6 @@ def get_logits_warper_patch(self, generation_config): ) ) - if generation_config.min_p is not None and 0.0 < generation_config.min_p <= 1.0: - warpers_to_add.append( - MinPLogitsWarper( - min_p=generation_config.min_p, - min_tokens_to_keep=min_tokens_to_keep - ) - ) - if generation_config.dynamic_temperature: warpers_to_add.append( DynamicTemperatureLogitsWarper( @@ -448,20 +483,44 @@ def get_logits_warper_patch(self, generation_config): def get_logits_processor_patch(self, **kwargs): - repetition_penalty = kwargs['generation_config'].repetition_penalty - presence_penalty = kwargs['generation_config'].presence_penalty - frequency_penalty = kwargs['generation_config'].frequency_penalty - repetition_penalty_range = kwargs['generation_config'].repetition_penalty_range - do_rep_pen_hijack = (repetition_penalty > 1) or (presence_penalty != 0) or (frequency_penalty != 0) + generation_config = kwargs['generation_config'] + + do_rep_pen_hijack = (generation_config.repetition_penalty > 1) or (generation_config.presence_penalty != 0) or (generation_config.frequency_penalty != 0) if do_rep_pen_hijack: - kwargs['generation_config'].repetition_penalty = 1.1 # Set to value > 1 to ensure RepetitionPenaltyLogitsProcessor is created + generation_config.repetition_penalty = 1.1 # Set to value > 1 to ensure RepetitionPenaltyLogitsProcessor is created result = self._get_logits_processor_old(**kwargs) if do_rep_pen_hijack: for i in range(len(result)): if result[i].__class__.__name__ == 'RepetitionPenaltyLogitsProcessor': - result[i] = RepetitionPenaltyLogitsProcessorWithRange(repetition_penalty, presence_penalty, frequency_penalty, repetition_penalty_range) + result[i] = RepetitionPenaltyLogitsProcessorWithRange( + generation_config.repetition_penalty, + generation_config.presence_penalty, + generation_config.frequency_penalty, + generation_config.repetition_penalty_range + ) + + if generation_config.dry_multiplier is not None and generation_config.dry_multiplier > 0.0: + dry_sequence_breakers = generation_config.dry_sequence_breakers + + # Support both JSON array notation and comma-separated strings. + if not dry_sequence_breakers.startswith("["): + dry_sequence_breakers = "[" + dry_sequence_breakers + "]" + + sequence_breaker_strings = json.loads(dry_sequence_breakers) + # Prefix with 'a' to get the correct encoding of the token at the end of a text. 
+ sequence_breakers = {shared.tokenizer.encode(f'a{s}')[-1] for s in sequence_breaker_strings} + + result.append( + DRYLogitsProcessor( + multiplier=generation_config.dry_multiplier, + base=generation_config.dry_base, + allowed_length=generation_config.dry_allowed_length, + sequence_breakers=sequence_breakers, + _range=generation_config.repetition_penalty_range, + ) + ) return result @@ -483,6 +542,10 @@ def generation_config_init_patch(self, **kwargs): self.repetition_penalty_range = kwargs.pop("repetition_penalty_range", 0) self.presence_penalty = kwargs.pop("presence_penalty", 0) self.frequency_penalty = kwargs.pop("frequency_penalty", 0) + self.dry_multiplier = kwargs.pop("dry_multiplier", 0.0) + self.dry_base = kwargs.pop("dry_base", 1.75) + self.dry_allowed_length = kwargs.pop("dry_allowed_length", 2) + self.dry_sequence_breakers = kwargs.pop("dry_sequence_breakers", '"\\n", ":", "\\"", "*"') self.temperature_last = kwargs.pop("temperature_last", False) self.sampler_priority = kwargs.pop("sampler_priority", ['temperature', 'dynamic_temperature', 'quadratic_sampling', 'top_k', 'top_p', 'typical_p', 'epsilon_cutoff', 'eta_cutoff', 'tfs', 'top_a', 'min_p', 'mirostat']) diff --git a/modules/shared.py b/modules/shared.py old mode 100755 new mode 100644 index 67c65c30a21b798cbd04b6141b13827e6f67e671..43533a1480914ddc7682b29be35b0b8a9b24b1b0 --- a/modules/shared.py +++ b/modules/shared.py @@ -32,7 +32,7 @@ settings = { 'dark_theme': True, 'show_controls': True, 'start_with': '', - 'mode': 'chat', + 'mode': 'chat-instruct', 'chat_style': 'cai-chat', 'prompt-default': 'QA', 'prompt-notebook': 'QA', @@ -43,8 +43,6 @@ settings = { 'negative_prompt': '', 'seed': -1, 'truncation_length': 2048, - 'truncation_length_min': 0, - 'truncation_length_max': 200000, 'max_tokens_second': 0, 'max_updates_second': 0, 'prompt_lookup_num_tokens': 0, @@ -84,10 +82,11 @@ group.add_argument('--settings', type=str, help='Load the default interface sett group.add_argument('--extensions', type=str, nargs='+', help='The list of extensions to load. If you want to load more than one extension, write the names separated by spaces.') group.add_argument('--verbose', action='store_true', help='Print the prompts to the terminal.') group.add_argument('--chat-buttons', action='store_true', help='Show buttons on the chat tab instead of a hover menu.') +group.add_argument('--idle-timeout', type=int, default=0, help='Unload model after this many minutes of inactivity. It will be automatically reloaded when you try to use it again.') # Model loader group = parser.add_argument_group('Model loader') -group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ, AutoAWQ, GPTQ-for-LLaMa, QuIP#.') +group.add_argument('--loader', type=str, help='Choose the model loader manually, otherwise, it will get autodetected. Valid options: Transformers, llama.cpp, llamacpp_HF, ExLlamav2_HF, ExLlamav2, AutoGPTQ.') # Transformers/Accelerate group = parser.add_argument_group('Transformers/Accelerate') @@ -104,6 +103,7 @@ group.add_argument('--trust-remote-code', action='store_true', help='Set trust_r group.add_argument('--force-safetensors', action='store_true', help='Set use_safetensors=True while loading the model. This prevents arbitrary code execution.') group.add_argument('--no_use_fast', action='store_true', help='Set use_fast=False while loading the tokenizer (it\'s True by default). 
Use this if you have any problems related to use_fast.') group.add_argument('--use_flash_attention_2', action='store_true', help='Set use_flash_attention_2=True while loading the model.') +group.add_argument('--use_eager_attention', action='store_true', help='Set attn_implementation= eager while loading the model.') # bitsandbytes 4-bit group = parser.add_argument_group('bitsandbytes 4-bit') @@ -114,7 +114,8 @@ group.add_argument('--quant_type', type=str, default='nf4', help='quant_type for # llama.cpp group = parser.add_argument_group('llama.cpp') -group.add_argument('--tensorcores', action='store_true', help='Use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards. NVIDIA only.') +group.add_argument('--flash-attn', action='store_true', help='Use flash-attention.') +group.add_argument('--tensorcores', action='store_true', help='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.') group.add_argument('--n_ctx', type=int, default=2048, help='Size of the prompt context.') group.add_argument('--threads', type=int, default=0, help='Number of threads to use.') group.add_argument('--threads-batch', type=int, default=0, help='Number of threads to use for batches/prompt processing.') @@ -123,7 +124,7 @@ group.add_argument('--n_batch', type=int, default=512, help='Maximum number of p group.add_argument('--no-mmap', action='store_true', help='Prevent mmap from being used.') group.add_argument('--mlock', action='store_true', help='Force the system to keep the model in RAM.') group.add_argument('--n-gpu-layers', type=int, default=0, help='Number of layers to offload to the GPU.') -group.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 18,17.') +group.add_argument('--tensor_split', type=str, default=None, help='Split the model across multiple GPUs. Comma-separated list of proportions. Example: 60,40.') group.add_argument('--numa', action='store_true', help='Activate NUMA task allocation for llama.cpp.') group.add_argument('--logits_all', action='store_true', help='Needs to be set for perplexity evaluation to work. Otherwise, ignore it, as it makes prompt processing slower.') group.add_argument('--no_offload_kqv', action='store_true', help='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.') @@ -131,6 +132,7 @@ group.add_argument('--cache-capacity', type=str, help='Maximum cache capacity (l group.add_argument('--row_split', action='store_true', help='Split the model by rows across GPUs. This may improve multi-gpu performance.') group.add_argument('--streaming-llm', action='store_true', help='Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.') group.add_argument('--attention-sink-size', type=int, default=5, help='StreamingLLM: number of sink tokens. Only used if the trimmed prompt does not share a prefix with the old prompt.') +group.add_argument('--tokenizer-dir', type=str, help='Load the tokenizer from this folder. 
Meant to be used with llamacpp_HF through the command-line.') # ExLlamaV2 group = parser.add_argument_group('ExLlamaV2') @@ -139,6 +141,8 @@ group.add_argument('--autosplit', action='store_true', help='Autosplit the model group.add_argument('--max_seq_len', type=int, default=2048, help='Maximum sequence length.') group.add_argument('--cfg-cache', action='store_true', help='ExLlamav2_HF: Create an additional cache for CFG negative prompts. Necessary to use CFG with that loader.') group.add_argument('--no_flash_attn', action='store_true', help='Force flash-attention to not be used.') +group.add_argument('--no_xformers', action='store_true', help='Force xformers to not be used.') +group.add_argument('--no_sdpa', action='store_true', help='Force Torch SDPA to not be used.') group.add_argument('--cache_8bit', action='store_true', help='Use 8-bit cache to save VRAM.') group.add_argument('--cache_4bit', action='store_true', help='Use Q4 cache to save VRAM.') group.add_argument('--num_experts_per_token', type=int, default=2, help='Number of experts to use for generation. Applies to MoE models like Mixtral.') @@ -146,26 +150,22 @@ group.add_argument('--num_experts_per_token', type=int, default=2, help='Number # AutoGPTQ group = parser.add_argument_group('AutoGPTQ') group.add_argument('--triton', action='store_true', help='Use triton.') -group.add_argument('--no_inject_fused_attention', action='store_true', help='Disable the use of fused attention, which will use less VRAM at the cost of slower inference.') group.add_argument('--no_inject_fused_mlp', action='store_true', help='Triton mode only: disable the use of fused MLP, which will use less VRAM at the cost of slower inference.') group.add_argument('--no_use_cuda_fp16', action='store_true', help='This can make models faster on some systems.') group.add_argument('--desc_act', action='store_true', help='For models that do not have a quantize_config.json, this parameter is used to define whether to set desc_act or not in BaseQuantizeConfig.') group.add_argument('--disable_exllama', action='store_true', help='Disable ExLlama kernel, which can improve inference speed on some systems.') group.add_argument('--disable_exllamav2', action='store_true', help='Disable ExLlamav2 kernel.') - -# GPTQ-for-LLaMa -group = parser.add_argument_group('GPTQ-for-LLaMa') group.add_argument('--wbits', type=int, default=0, help='Load a pre-quantized model with specified precision in bits. 2, 3, 4 and 8 are supported.') -group.add_argument('--model_type', type=str, help='Model type of pre-quantized model. Currently LLaMA, OPT, and GPT-J are supported.') group.add_argument('--groupsize', type=int, default=-1, help='Group size.') -group.add_argument('--pre_layer', type=int, nargs='+', help='The number of layers to allocate to the GPU. Setting this parameter enables CPU offloading for 4-bit models. For multi-gpu, write the numbers separated by spaces, eg --pre_layer 30 60.') -group.add_argument('--checkpoint', type=str, help='The path to the quantized checkpoint file. If not specified, it will be automatically detected.') -group.add_argument('--monkey-patch', action='store_true', help='Apply the monkey patch for using LoRAs with quantized models.') # HQQ group = parser.add_argument_group('HQQ') group.add_argument('--hqq-backend', type=str, default='PYTORCH_COMPILE', help='Backend for the HQQ loader. 
Valid options: PYTORCH, PYTORCH_COMPILE, ATEN.') +# TensorRT-LLM +group = parser.add_argument_group('TensorRT-LLM') +group.add_argument('--cpp-runner', action='store_true', help='Use the ModelRunnerCpp runner, which is faster than the default ModelRunner but doesn\'t support streaming yet.') + # DeepSpeed group = parser.add_argument_group('DeepSpeed') group.add_argument('--deepspeed', action='store_true', help='Enable the use of DeepSpeed ZeRO-3 for inference via the Transformers integration.') @@ -189,6 +189,7 @@ group.add_argument('--gradio-auth', type=str, help='Set Gradio authentication pa group.add_argument('--gradio-auth-path', type=str, help='Set the Gradio authentication file path. The file should contain one or more user:password pairs in the same format as above.', default=None) group.add_argument('--ssl-keyfile', type=str, help='The path to the SSL certificate key file.', default=None) group.add_argument('--ssl-certfile', type=str, help='The path to the SSL certificate cert file.', default=None) +group.add_argument('--subpath', type=str, help='Customize the subpath for gradio, use with reverse proxy') # API group = parser.add_argument_group('API') @@ -205,7 +206,12 @@ group = parser.add_argument_group('Multimodal') group.add_argument('--multimodal-pipeline', type=str, default=None, help='The multimodal pipeline to use. Examples: llava-7b, llava-13b.') # Deprecated parameters -# group = parser.add_argument_group('Deprecated') +group = parser.add_argument_group('Deprecated') +group.add_argument('--model_type', type=str, help='DEPRECATED') +group.add_argument('--pre_layer', type=int, nargs='+', help='DEPRECATED') +group.add_argument('--checkpoint', type=str, help='DEPRECATED') +group.add_argument('--monkey-patch', action='store_true', help='DEPRECATED') +group.add_argument('--no_inject_fused_attention', action='store_true', help='DEPRECATED') args = parser.parse_args() args_defaults = parser.parse_args([]) @@ -250,20 +256,16 @@ def fix_loader_name(name): return 'Transformers' elif name in ['autogptq', 'auto-gptq', 'auto_gptq', 'auto gptq']: return 'AutoGPTQ' - elif name in ['gptq-for-llama', 'gptqforllama', 'gptqllama', 'gptq for llama', 'gptq_for_llama']: - return 'GPTQ-for-LLaMa' elif name in ['exllama', 'ex-llama', 'ex_llama', 'exlama']: return 'ExLlama' elif name in ['exllamav2', 'exllama-v2', 'ex_llama-v2', 'exlamav2', 'exlama-v2', 'exllama2', 'exllama-2']: return 'ExLlamav2' elif name in ['exllamav2-hf', 'exllamav2_hf', 'exllama-v2-hf', 'exllama_v2_hf', 'exllama-v2_hf', 'exllama2-hf', 'exllama2_hf', 'exllama-2-hf', 'exllama_2_hf', 'exllama-2_hf']: return 'ExLlamav2_HF' - elif name in ['autoawq', 'awq', 'auto-awq']: - return 'AutoAWQ' - elif name in ['quip#', 'quip-sharp', 'quipsharp', 'quip_sharp']: - return 'QuIP#' elif name in ['hqq']: return 'HQQ' + elif name in ['tensorrt', 'tensorrtllm', 'tensorrt_llm', 'tensorrt-llm', 'tensort', 'tensortllm']: + return 'TensorRT-LLM' def add_extension(name, last=False): diff --git a/modules/tensorrt_llm.py b/modules/tensorrt_llm.py new file mode 100644 index 0000000000000000000000000000000000000000..c2685b7598000c0d835518d3e8222578ac9047f2 --- /dev/null +++ b/modules/tensorrt_llm.py @@ -0,0 +1,131 @@ +from pathlib import Path + +import tensorrt_llm +import torch +from tensorrt_llm.runtime import ModelRunner, ModelRunnerCpp + +from modules import shared +from modules.logging_colors import logger +from modules.text_generation import ( + get_max_prompt_length, + get_reply_from_output_ids +) + + +class TensorRTLLMModel: + def 
__init__(self): + pass + + @classmethod + def from_pretrained(self, path_to_model): + + path_to_model = Path(f'{shared.args.model_dir}') / Path(path_to_model) + runtime_rank = tensorrt_llm.mpi_rank() + + # Define model settings + runner_kwargs = dict( + engine_dir=str(path_to_model), + lora_dir=None, + rank=runtime_rank, + debug_mode=False, + lora_ckpt_source="hf", + ) + + if shared.args.cpp_runner: + logger.info("TensorRT-LLM: Using \"ModelRunnerCpp\"") + runner_kwargs.update( + max_batch_size=1, + max_input_len=shared.args.max_seq_len - 512, + max_output_len=512, + max_beam_width=1, + max_attention_window_size=None, + sink_token_length=None, + ) + else: + logger.info("TensorRT-LLM: Using \"ModelRunner\"") + + # Load the model + runner_cls = ModelRunnerCpp if shared.args.cpp_runner else ModelRunner + runner = runner_cls.from_dir(**runner_kwargs) + + result = self() + result.model = runner + result.runtime_rank = runtime_rank + + return result + + def generate_with_streaming(self, prompt, state): + batch_input_ids = [] + input_ids = shared.tokenizer.encode( + prompt, + add_special_tokens=True, + truncation=False, + ) + input_ids = torch.tensor(input_ids, dtype=torch.int32) + input_ids = input_ids[-get_max_prompt_length(state):] # Apply truncation_length + batch_input_ids.append(input_ids) + + if shared.args.cpp_runner: + max_new_tokens = min(512, state['max_new_tokens']) + elif state['auto_max_new_tokens']: + max_new_tokens = state['truncation_length'] - input_ids.shape[-1] + else: + max_new_tokens = state['max_new_tokens'] + + with torch.no_grad(): + generator = self.model.generate( + batch_input_ids, + max_new_tokens=max_new_tokens, + max_attention_window_size=None, + sink_token_length=None, + end_id=shared.tokenizer.eos_token_id if not state['ban_eos_token'] else -1, + pad_id=shared.tokenizer.pad_token_id or shared.tokenizer.eos_token_id, + temperature=state['temperature'], + top_k=state['top_k'], + top_p=state['top_p'], + num_beams=1, + length_penalty=1.0, + repetition_penalty=state['repetition_penalty'], + presence_penalty=state['presence_penalty'], + frequency_penalty=state['frequency_penalty'], + stop_words_list=None, + bad_words_list=None, + lora_uids=None, + prompt_table_path=None, + prompt_tasks=None, + streaming=not shared.args.cpp_runner, + output_sequence_lengths=True, + return_dict=True, + medusa_choices=None + ) + + torch.cuda.synchronize() + + cumulative_reply = '' + starting_from = batch_input_ids[0].shape[-1] + + if shared.args.cpp_runner: + sequence_length = generator['sequence_lengths'][0].item() + output_ids = generator['output_ids'][0][0][:sequence_length].tolist() + + cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from) + starting_from = sequence_length + yield cumulative_reply + else: + for curr_outputs in generator: + if shared.stop_everything: + break + + sequence_length = curr_outputs['sequence_lengths'][0].item() + output_ids = curr_outputs['output_ids'][0][0][:sequence_length].tolist() + + cumulative_reply += get_reply_from_output_ids(output_ids, state, starting_from=starting_from) + starting_from = sequence_length + yield cumulative_reply + + def generate(self, prompt, state): + output = '' + for output in self.generate_with_streaming(prompt, state): + pass + + return output diff --git a/modules/text_generation.py b/modules/text_generation.py old mode 100755 new mode 100644 index 17ec6bd891d37e095a8f34d84bc0a549bfc2967c..75e5ef36aec679fb8f0893b5c15f35a036877134 --- a/modules/text_generation.py +++ 
b/modules/text_generation.py @@ -16,6 +16,7 @@ from transformers import ( ) import modules.shared as shared +from modules import models from modules.cache_utils import process_llamacpp_cache from modules.callbacks import ( Iteratorize, @@ -27,15 +28,19 @@ from modules.grammar.grammar_utils import initialize_grammar from modules.grammar.logits_process import GrammarConstrainedLogitsProcessor from modules.html_generator import generate_basic_html from modules.logging_colors import logger -from modules.models import clear_torch_cache +from modules.models import clear_torch_cache, load_model def generate_reply(*args, **kwargs): + if shared.args.idle_timeout > 0 and shared.model is None and shared.model_name not in [None, 'None']: + shared.model, shared.tokenizer = load_model(shared.model_name) + shared.generation_lock.acquire() try: for result in _generate_reply(*args, **kwargs): yield result finally: + models.last_generation_time = time.time() shared.generation_lock.release() @@ -49,15 +54,14 @@ def _generate_reply(question, state, stopping_strings=None, is_chat=False, escap yield '' return - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']: generate_func = generate_reply_custom else: generate_func = generate_reply_HF if generate_func != generate_reply_HF and shared.args.verbose: logger.info("PROMPT=") - print(question) - print() + print_prompt(question) # Prepare the input original_question = question @@ -128,21 +132,33 @@ def encode(prompt, add_special_tokens=True, add_bos_token=True, truncation_lengt if shared.tokenizer is None: raise ValueError('No tokenizer is loaded') - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model']: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel']: input_ids = shared.tokenizer.encode(str(prompt)) if shared.model.__class__.__name__ not in ['Exllamav2Model']: input_ids = np.array(input_ids).reshape(1, len(input_ids)) else: input_ids = shared.tokenizer.encode(str(prompt), return_tensors='pt', add_special_tokens=add_special_tokens) - if not add_bos_token: - while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id: - input_ids = input_ids[:, 1:] + + if hasattr(shared.tokenizer, 'bos_token_id') and shared.tokenizer.bos_token_id is not None: + if add_bos_token: + if (len(input_ids[0]) > 0 and input_ids[0][0] != shared.tokenizer.bos_token_id) or len(input_ids[0]) == 0: + # Add a missing bos token (it may not have been added due to faulty model metadata) + bos_tensor = torch.tensor([[shared.tokenizer.bos_token_id]]) + input_ids = torch.cat((bos_tensor, input_ids), 1) + + # Prevent double bos token due to jinja templates that already include a bos token somewhere + while len(input_ids[0]) > 1 and input_ids[0][0] == shared.tokenizer.bos_token_id and input_ids[0][1] == shared.tokenizer.bos_token_id: + input_ids = input_ids[:, 1:] + else: + # Remove any bos token that may have been added + while len(input_ids[0]) > 0 and input_ids[0][0] == shared.tokenizer.bos_token_id: + input_ids = input_ids[:, 1:] # Handling truncation if truncation_length is not None: input_ids = input_ids[:, -truncation_length:] - if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model'] or shared.args.cpu: + if shared.model.__class__.__name__ in ['LlamaCppModel', 'Exllamav2Model', 'TensorRTLLMModel'] or shared.args.cpu: return input_ids elif shared.args.deepspeed: import deepspeed @@ -268,7 +284,7 @@ def 
get_reply_from_output_ids(output_ids, state=None, starting_from=0): def generate_reply_HF(question, original_question, seed, state, stopping_strings=None, is_chat=False): generate_params = {} - for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size']: + for k in ['max_new_tokens', 'temperature', 'temperature_last', 'dynamic_temperature', 'dynatemp_low', 'dynatemp_high', 'dynatemp_exponent', 'smoothing_factor', 'smoothing_curve', 'top_p', 'min_p', 'top_k', 'repetition_penalty', 'presence_penalty', 'frequency_penalty', 'repetition_penalty_range', 'typical_p', 'tfs', 'top_a', 'guidance_scale', 'penalty_alpha', 'mirostat_mode', 'mirostat_tau', 'mirostat_eta', 'do_sample', 'encoder_repetition_penalty', 'no_repeat_ngram_size', 'dry_multiplier', 'dry_base', 'dry_allowed_length', 'dry_sequence_breakers']: if k in state: generate_params[k] = state[k] @@ -343,8 +359,7 @@ def generate_reply_HF(question, original_question, seed, state, stopping_strings print() logger.info("PROMPT=") - print(decode(input_ids[0], skip_special_tokens=False)) - print() + print_prompt(decode(input_ids[0], skip_special_tokens=False)) # Handle StreamingLLM for llamacpp_HF if shared.model.__class__.__name__ == 'LlamacppHF' and shared.args.streaming_llm: @@ -433,3 +448,18 @@ def generate_reply_custom(question, original_question, seed, state, stopping_str new_tokens = len(encode(original_question + reply)[0]) - original_tokens print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})') return + + +def print_prompt(prompt, max_chars=2000): + DARK_YELLOW = "\033[38;5;3m" + RESET = "\033[0m" + + if len(prompt) > max_chars: + half_chars = max_chars // 2 + hidden_len = len(prompt[half_chars:-half_chars]) + hidden_msg = f"{DARK_YELLOW}[...{hidden_len} characters hidden...]{RESET}" + print(prompt[:half_chars] + hidden_msg + prompt[-half_chars:]) + else: + print(prompt) + + print() diff --git a/modules/training.py b/modules/training.py old mode 100755 new mode 100644 index dd360e7da5b63e17210b96cf5c364695591da433..b003fc8c2827e2e53619be80cb5ad0ba242d272e --- a/modules/training.py +++ b/modules/training.py @@ -165,7 +165,7 @@ def create_ui(): stride_length = gr.Slider(label='Stride', minimum=0, maximum=32768, value=512, step=256, info='Used to make the evaluation faster at the cost of accuracy. 1 = slowest but most accurate. 512 is a common value.') with gr.Column(): - max_length = gr.Slider(label='max_length', minimum=0, maximum=shared.settings['truncation_length_max'], value=0, step=256, info='The context for each evaluation. If set to 0, the maximum context length for the model will be used.') + max_length = gr.Number(label='max_length', precision=0, step=256, value=0, info='The context for each evaluation. 
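
A quick worked example of the truncation arithmetic in the print_prompt() helper above, with hypothetical sizes: only the first and last max_chars // 2 characters are printed, and the hidden middle is summarised.

prompt = 'A' * 5000
max_chars = 2000
half_chars = max_chars // 2                        # 1000 characters kept on each side
hidden_len = len(prompt[half_chars:-half_chars])   # 5000 - 2 * 1000 = 3000 hidden
assert hidden_len == 3000
# print_prompt(prompt) therefore prints the first 1000 'A's, then
# '[...3000 characters hidden...]' in dark yellow, then the last 1000 'A's.
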
If set to 0, the maximum context length for the model will be used.') with gr.Row(): start_current_evaluation = gr.Button("Evaluate loaded model", interactive=not mu) @@ -292,12 +292,6 @@ def calc_trainable_parameters(model): def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: bool, k_proj_en: bool, o_proj_en: bool, gate_proj_en: bool, down_proj_en: bool, up_proj_en: bool, save_steps: int, micro_batch_size: int, batch_size: int, epochs: int, learning_rate: str, lr_scheduler_type: str, lora_rank: int, lora_alpha: int, lora_dropout: float, cutoff_len: int, dataset: str, eval_dataset: str, format: str, eval_steps: int, raw_text_file: str, overlap_len: int, newline_favor_len: int, higher_rank_limit: bool, warmup_steps: int, optimizer: str, hard_cut_string: str, train_only_after: str, stop_at_loss: float, add_eos_token: bool, min_chars: int, report_to: str): - if shared.args.monkey_patch: - from alpaca_lora_4bit.monkeypatch.peft_tuners_lora_monkey_patch import ( - replace_peft_model_with_int4_lora_model - ) - replace_peft_model_with_int4_lora_model() - global WANT_INTERRUPT WANT_INTERRUPT = False @@ -329,10 +323,6 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: time.sleep(5) - if shared.args.loader == 'GPTQ-for-LLaMa' and not shared.args.monkey_patch: - yield "LoRA training with GPTQ-for-LLaMa requires loading with `--monkey-patch`" - return - if cutoff_len <= 0 or micro_batch_size <= 0 or batch_size <= 0 or actual_lr <= 0 or lora_rank <= 0 or lora_alpha <= 0: yield "Cannot input zeroes." return @@ -553,15 +543,6 @@ def do_train(lora_name: str, always_override: bool, q_proj_en: bool, v_proj_en: yield traceback.format_exc().replace('\n', '\n\n') return - if shared.args.monkey_patch: - from alpaca_lora_4bit.autograd_4bit import Autograd4bitQuantLinear - from alpaca_lora_4bit.models import Linear4bitLt - for _, m in lora_model.named_modules(): - if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt): - if m.is_v1_model: - m.zeros = m.zeros.half() - m.scales = m.scales.half() - class Tracked(): def __init__(self): self.current_steps = 0 diff --git a/modules/ui.py b/modules/ui.py old mode 100755 new mode 100644 index 6694c5f3eb3b0a4bcd306e1f8f979af627cb8d70..47f92cf0f941294f2dca65e2d320ff9a9c37a346 --- a/modules/ui.py +++ b/modules/ui.py @@ -15,8 +15,6 @@ with open(Path(__file__).resolve().parent / '../css/main.css', 'r') as f: css += f.read() with open(Path(__file__).resolve().parent / '../css/katex/katex.min.css', 'r') as f: css += f.read() -with open(Path(__file__).resolve().parent / '../css/highlightjs/github-dark.min.css', 'r') as f: - css += f.read() with open(Path(__file__).resolve().parent / '../css/highlightjs/highlightjs-copy.min.css', 'r') as f: css += f.read() with open(Path(__file__).resolve().parent / '../js/main.js', 'r') as f: @@ -29,6 +27,8 @@ with open(Path(__file__).resolve().parent / '../js/show_controls.js', 'r') as f: show_controls_js = f.read() with open(Path(__file__).resolve().parent / '../js/update_big_picture.js', 'r') as f: update_big_picture_js = f.read() +with open(Path(__file__).resolve().parent / '../js/dark_theme.js', 'r') as f: + dark_theme_js = f.read() refresh_symbol = '🔄' delete_symbol = '🗑️' @@ -43,6 +43,11 @@ theme = gr.themes.Default( body_text_color_subdued='#484848', background_fill_secondary='#eaeaea', background_fill_primary='var(--neutral-50)', + body_background_fill="white", + block_background_fill="#f4f4f4", + body_text_color="#333", + 
button_secondary_background_fill="#f4f4f4", + button_secondary_border_color="var(--border-color-primary)" ) if Path("notification.mp3").exists(): @@ -64,23 +69,23 @@ def list_model_elements(): 'trust_remote_code', 'no_use_fast', 'use_flash_attention_2', + 'use_eager_attention', 'load_in_4bit', 'compute_dtype', 'quant_type', 'use_double_quant', 'wbits', 'groupsize', - 'model_type', - 'pre_layer', 'triton', 'desc_act', - 'no_inject_fused_attention', 'no_inject_fused_mlp', 'no_use_cuda_fp16', 'disable_exllama', 'disable_exllamav2', 'cfg_cache', 'no_flash_attn', + 'no_xformers', + 'no_sdpa', 'num_experts_per_token', 'cache_8bit', 'cache_4bit', @@ -104,10 +109,13 @@ def list_model_elements(): 'no_offload_kqv', 'row_split', 'tensorcores', + 'flash_attn', 'streaming_llm', 'attention_sink_size', 'hqq_backend', + 'cpp_runner', ] + if is_torch_xpu_available(): for i in range(torch.xpu.device_count()): elements.append(f'gpu_memory_{i}') @@ -146,6 +154,10 @@ def list_interface_input_elements(): 'repetition_penalty_range', 'encoder_repetition_penalty', 'no_repeat_ngram_size', + 'dry_multiplier', + 'dry_base', + 'dry_allowed_length', + 'dry_sequence_breakers', 'do_sample', 'penalty_alpha', 'mirostat_mode', @@ -172,6 +184,7 @@ def list_interface_input_elements(): 'start_with', 'character_menu', 'history', + 'unique_id', 'name1', 'user_bio', 'name2', @@ -201,9 +214,11 @@ def list_interface_input_elements(): def gather_interface_values(*args): + interface_elements = list_interface_input_elements() + output = {} - for i, element in enumerate(list_interface_input_elements()): - output[element] = args[i] + for element, value in zip(interface_elements, args): + output[element] = value if not shared.args.multi_user: shared.persistent_interface_state = output @@ -214,8 +229,14 @@ def gather_interface_values(*args): def apply_interface_values(state, use_persistent=False): if use_persistent: state = shared.persistent_interface_state + if 'textbox-default' in state: + state.pop('prompt_menu-default') + + if 'textbox-notebook' in state: + state.pop('prompt_menu-notebook') elements = list_interface_input_elements() + if len(state) == 0: return [gr.update() for k in elements] # Dummy, do nothing else: @@ -224,7 +245,7 @@ def apply_interface_values(state, use_persistent=False): def save_settings(state, preset, extensions_list, show_controls, theme_state): output = copy.deepcopy(shared.settings) - exclude = ['name2', 'greeting', 'context', 'turn_template', 'truncation_length'] + exclude = ['name2', 'greeting', 'context', 'truncation_length', 'instruction_template_str'] for k in state: if k in shared.settings and k not in exclude: output[k] = state[k] @@ -256,7 +277,7 @@ def save_settings(state, preset, extensions_list, show_controls, theme_state): if key in shared.default_settings and output[key] == shared.default_settings[key]: output.pop(key) - return yaml.dump(output, sort_keys=False, width=float("inf")) + return yaml.dump(output, sort_keys=False, width=float("inf"), allow_unicode=True) def create_refresh_button(refresh_component, refresh_method, refreshed_args, elem_class, interactive=True): diff --git a/modules/ui_chat.py b/modules/ui_chat.py old mode 100755 new mode 100644 index 3193bd67489db446dccdd63e55689fb429601be6..57143cd8c0de7f46d4324659e3caa637894ea3fd --- a/modules/ui_chat.py +++ b/modules/ui_chat.py @@ -19,7 +19,7 @@ def create_ui(): mu = shared.args.multi_user shared.gradio['Chat input'] = gr.State() - shared.gradio['history'] = gr.State({'internal': [], 'visible': []}) + shared.gradio['history'] = 
gr.JSON({'internal': [], 'visible': []}, visible=False) with gr.Tab('Chat', elem_id='chat-tab', elem_classes=("old-ui" if shared.args.chat_buttons else None)): with gr.Row(): @@ -62,9 +62,6 @@ def create_ui(): with gr.Row(elem_id='past-chats-row', elem_classes=['pretty_scrollbar']): with gr.Column(): - with gr.Row(): - shared.gradio['unique_id'] = gr.Dropdown(label='Past chats', elem_classes=['slim-dropdown'], interactive=not mu) - with gr.Row(): shared.gradio['rename_chat'] = gr.Button('Rename', elem_classes='refresh-button', interactive=not mu) shared.gradio['delete_chat'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) @@ -74,8 +71,13 @@ def create_ui(): with gr.Row(elem_id='rename-row'): shared.gradio['rename_to'] = gr.Textbox(label='Rename to:', placeholder='New name', visible=False, elem_classes=['no-background']) - shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) - shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) + with gr.Row(): + shared.gradio['rename_to-confirm'] = gr.Button('Confirm', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) + shared.gradio['rename_to-cancel'] = gr.Button('Cancel', visible=False, elem_classes=['refresh-button', 'focus-on-chat-input']) + + gr.Markdown("Past chats") + with gr.Row(): + shared.gradio['unique_id'] = gr.Radio(label="", elem_classes=['slim-dropdown', 'pretty_scrollbar'], interactive=not mu, elem_id='past-chats') with gr.Row(elem_id='chat-controls', elem_classes=['pretty_scrollbar']): with gr.Column(): @@ -83,13 +85,13 @@ def create_ui(): shared.gradio['start_with'] = gr.Textbox(label='Start reply with', placeholder='Sure thing!', value=shared.settings['start_with'], elem_classes=['add_scrollbar']) with gr.Row(): - shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value='chat', label='Mode', info='Defines how the chat prompt is generated. In instruct and chat-instruct modes, the instruction template selected under Parameters > Instruction template must match the current model.', elem_id='chat-mode') + shared.gradio['mode'] = gr.Radio(choices=['chat', 'chat-instruct', 'instruct'], value=shared.settings['mode'] if shared.settings['mode'] in ['chat', 'chat-instruct'] else None, label='Mode', info='Defines how the chat prompt is generated. 
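
A small sketch of the zip-based pairing that gather_interface_values() now uses; the element names and values here are made up. zip() stops at the shorter sequence, so a missing trailing value is silently skipped instead of raising an IndexError the way the old positional indexing could.

interface_elements = ['temperature', 'top_p', 'top_k']   # hypothetical element names
args = (0.7, 0.9)                                         # one trailing value missing

output = {}
for element, value in zip(interface_elements, args):
    output[element] = value

print(output)   # {'temperature': 0.7, 'top_p': 0.9}
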
In instruct and chat-instruct modes, the instruction template Parameters > Instruction template is used.', elem_id='chat-mode') with gr.Row(): shared.gradio['chat_style'] = gr.Dropdown(choices=utils.get_available_chat_styles(), label='Chat style', value=shared.settings['chat_style'], visible=shared.settings['mode'] != 'instruct') with gr.Row(): - shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=16, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=False, elem_classes=['add_scrollbar']) + shared.gradio['chat-instruct_command'] = gr.Textbox(value=shared.settings['chat-instruct_command'], lines=12, label='Command for chat-instruct mode', info='<|character|> and <|prompt|> get replaced with the bot name and the regular chat prompt respectively.', visible=shared.settings['mode'] == 'chat-instruct', elem_classes=['add_scrollbar']) def create_chat_settings_ui(): @@ -101,7 +103,7 @@ def create_chat_settings_ui(): with gr.Row(): shared.gradio['character_menu'] = gr.Dropdown(value=None, choices=utils.get_available_characters(), label='Character', elem_id='character-menu', info='Used in chat and chat-instruct modes.', elem_classes='slim-dropdown') ui.create_refresh_button(shared.gradio['character_menu'], lambda: None, lambda: {'choices': utils.get_available_characters()}, 'refresh-button', interactive=not mu) - shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', interactive=not mu) + shared.gradio['save_character'] = gr.Button('💾', elem_classes='refresh-button', elem_id="save-character", interactive=not mu) shared.gradio['delete_character'] = gr.Button('🗑️', elem_classes='refresh-button', interactive=not mu) shared.gradio['name2'] = gr.Textbox(value='', lines=1, label='Character\'s name') @@ -135,7 +137,7 @@ def create_chat_settings_ui(): shared.gradio['tavern_json'] = gr.State() with gr.Column(): shared.gradio['tavern_name'] = gr.Textbox(value='', lines=1, label='Name', interactive=False) - shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=4, max_lines=4, label='Description', interactive=False) + shared.gradio['tavern_desc'] = gr.Textbox(value='', lines=10, label='Description', interactive=False, elem_classes=['add_scrollbar']) shared.gradio['Submit tavern character'] = gr.Button(value='Submit', interactive=False) @@ -179,203 +181,145 @@ def create_event_handlers(): ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox'].submit( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: (x, ''), gradio('textbox'), gradio('Chat input', 'textbox'), show_progress=False).then( chat.generate_chat_reply_wrapper, gradio(inputs), gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - 
chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Regenerate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( partial(chat.generate_chat_reply_wrapper, regenerate=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Continue'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( partial(chat.generate_chat_reply_wrapper, _continue=True), gradio(inputs), gradio('display', 'history'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Impersonate'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( lambda x: x, gradio('textbox'), gradio('Chat input'), show_progress=False).then( chat.impersonate_wrapper, gradio(inputs), gradio('textbox', 'display'), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Replace last reply'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.replace_last_reply, gradio('textbox', 'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None) + chat.handle_replace_last_reply_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) shared.gradio['Send dummy message'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.send_dummy_message, gradio('textbox', 'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None) + chat.handle_send_dummy_message_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) shared.gradio['Send dummy reply'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.send_dummy_reply, gradio('textbox', 'interface_state'), gradio('history')).then( - lambda: '', None, gradio('textbox'), show_progress=False).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None) + 
chat.handle_send_dummy_reply_click, gradio('textbox', 'interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) shared.gradio['Remove last'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.remove_last_message, gradio('history'), gradio('textbox', 'history'), show_progress=False).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None) + chat.handle_remove_last_click, gradio('interface_state'), gradio('history', 'display', 'textbox'), show_progress=False) shared.gradio['Stop'].click( stop_everything_event, None, None, queue=False).then( - chat.redraw_html, gradio(reload_arr), gradio('display')) + chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False) if not shared.args.multi_user: shared.gradio['unique_id'].select( - chat.load_history, gradio('unique_id', 'character_menu', 'mode'), gradio('history')).then( - chat.redraw_html, gradio(reload_arr), gradio('display')) + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_unique_id_select, gradio('interface_state'), gradio('history', 'display'), show_progress=False) shared.gradio['Start new chat'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.start_new_chat, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')) + chat.handle_start_new_chat_click, gradio('interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False) shared.gradio['delete_chat'].click(lambda: [gr.update(visible=True), gr.update(visible=False), gr.update(visible=True)], None, gradio(clear_arr)) shared.gradio['delete_chat-cancel'].click(lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)) shared.gradio['delete_chat-confirm'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x, y: str(chat.find_all_histories(x).index(y)), gradio('interface_state', 'unique_id'), gradio('temporary_text')).then( - chat.delete_history, gradio('unique_id', 'character_menu', 'mode'), None).then( - chat.load_history_after_deletion, gradio('interface_state', 'temporary_text'), gradio('history', 'unique_id')).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda: [gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)], None, gradio(clear_arr)) - - shared.gradio['rename_chat'].click( - lambda x: x, gradio('unique_id'), gradio('rename_to')).then( - lambda: [gr.update(visible=True)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) - - shared.gradio['rename_to-cancel'].click( - lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) + chat.handle_delete_chat_confirm_click, gradio('interface_state'), gradio('history', 'display', 'unique_id') + gradio(clear_arr), show_progress=False) + shared.gradio['rename_chat'].click(chat.handle_rename_chat_click, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) + shared.gradio['rename_to-cancel'].click(lambda: 
[gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) shared.gradio['rename_to-confirm'].click( - chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then( - lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then( - lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id')) + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) shared.gradio['rename_to'].submit( - chat.rename_history, gradio('unique_id', 'rename_to', 'character_menu', 'mode'), None).then( - lambda: [gr.update(visible=False)] * 3, None, gradio('rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False).then( - lambda x, y: gr.update(choices=chat.find_all_histories(x), value=y), gradio('interface_state', 'rename_to'), gradio('unique_id')) + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_rename_chat_confirm, gradio('rename_to', 'interface_state'), gradio('unique_id', 'rename_to', 'rename_to-confirm', 'rename_to-cancel'), show_progress=False) shared.gradio['load_chat_history'].upload( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.start_new_chat, gradio('interface_state'), gradio('history')).then( - chat.load_history_json, gradio('load_chat_history', 'history'), gradio('history')).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then( - chat.save_history, gradio('history', 'unique_id', 'character_menu', 'mode'), None).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}') + chat.handle_upload_chat_history, gradio('load_chat_history', 'interface_state'), gradio('history', 'display', 'unique_id'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_chat()}}') shared.gradio['character_menu'].change( - chat.load_character, gradio('character_menu', 'name1', 'name2'), gradio('name1', 'name2', 'character_picture', 'greeting', 'context')).success( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - chat.load_latest_history, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')).then( - lambda: None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}') + chat.handle_character_menu_change, gradio('interface_state'), gradio('history', 'display', 'name1', 'name2', 'character_picture', 'greeting', 'context', 'unique_id'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.update_big_picture_js}; updateBigPicture()}}') shared.gradio['mode'].change( - lambda x: [gr.update(visible=x != 'instruct'), gr.update(visible=x == 'chat-instruct')], gradio('mode'), gradio('chat_style', 'chat-instruct_command'), show_progress=False).then( ui.gather_interface_values, 
gradio(shared.input_elements), gradio('interface_state')).then( - chat.load_latest_history, gradio('interface_state'), gradio('history')).then( - chat.redraw_html, gradio(reload_arr), gradio('display')).then( - lambda x: gr.update(choices=(histories := chat.find_all_histories(x)), value=histories[0]), gradio('interface_state'), gradio('unique_id')) + chat.handle_mode_change, gradio('interface_state'), gradio('history', 'display', 'chat_style', 'chat-instruct_command', 'unique_id'), show_progress=False).then( + None, gradio('mode'), None, js="(mode) => {mode === 'instruct' ? document.getElementById('character-menu').parentNode.parentNode.style.display = 'none' : document.getElementById('character-menu').parentNode.parentNode.style.display = ''}") - shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display')) + shared.gradio['chat_style'].change(chat.redraw_html, gradio(reload_arr), gradio('display'), show_progress=False) shared.gradio['Copy last reply'].click(chat.send_last_reply_to_input, gradio('history'), gradio('textbox'), show_progress=False) # Save/delete a character - shared.gradio['save_character'].click( - lambda x: x, gradio('name2'), gradio('save_character_filename')).then( - lambda: gr.update(visible=True), None, gradio('character_saver')) - - shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter')) - - shared.gradio['load_template'].click( - chat.load_instruction_template, gradio('instruction_template'), gradio('instruction_template_str')).then( - lambda: "Select template to load...", None, gradio('instruction_template')) - + shared.gradio['save_character'].click(chat.handle_save_character_click, gradio('name2'), gradio('save_character_filename', 'character_saver'), show_progress=False) + shared.gradio['delete_character'].click(lambda: gr.update(visible=True), None, gradio('character_deleter'), show_progress=False) + shared.gradio['load_template'].click(chat.handle_load_template_click, gradio('instruction_template'), gradio('instruction_template_str', 'instruction_template'), show_progress=False) shared.gradio['save_template'].click( - lambda: 'My Template.yaml', None, gradio('save_filename')).then( - lambda: 'instruction-templates/', None, gradio('save_root')).then( - chat.generate_instruction_template_yaml, gradio('instruction_template_str'), gradio('save_contents')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_template'].click( - lambda x: f'{x}.yaml', gradio('instruction_template'), gradio('delete_filename')).then( - lambda: 'instruction-templates/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_save_template_click, gradio('instruction_template_str'), gradio('save_filename', 'save_root', 'save_contents', 'file_saver'), show_progress=False) + shared.gradio['delete_template'].click(chat.handle_delete_template_click, gradio('instruction_template'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False) shared.gradio['save_chat_history'].click( lambda x: json.dumps(x, indent=4), gradio('history'), gradio('temporary_text')).then( None, gradio('temporary_text', 'character_menu', 'mode'), None, js=f'(hist, char, mode) => {{{ui.save_files_js}; saveHistory(hist, char, mode)}}') shared.gradio['Submit character'].click( - chat.upload_character, gradio('upload_json', 
'upload_img_bot'), gradio('character_menu')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') + chat.upload_character, gradio('upload_json', 'upload_img_bot'), gradio('character_menu'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') shared.gradio['Submit tavern character'].click( - chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') + chat.upload_tavern_character, gradio('upload_img_tavern', 'tavern_json'), gradio('character_menu'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_character()}}') shared.gradio['upload_json'].upload(lambda: gr.update(interactive=True), None, gradio('Submit character')) shared.gradio['upload_json'].clear(lambda: gr.update(interactive=False), None, gradio('Submit character')) shared.gradio['upload_img_tavern'].upload(chat.check_tavern_character, gradio('upload_img_tavern'), gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) shared.gradio['upload_img_tavern'].clear(lambda: (None, None, None, gr.update(interactive=False)), None, gradio('tavern_name', 'tavern_desc', 'tavern_json', 'Submit tavern character'), show_progress=False) shared.gradio['your_picture'].change( - chat.upload_your_profile_picture, gradio('your_picture'), None).then( - partial(chat.redraw_html, reset_cache=True), gradio(reload_arr), gradio('display')) + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + chat.handle_your_picture_change, gradio('your_picture', 'interface_state'), gradio('display'), show_progress=False) shared.gradio['send_instruction_to_default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then( - partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-default')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') + chat.handle_send_instruction_click, gradio('interface_state'), gradio('textbox-default'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') shared.gradio['send_instruction_to_notebook'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then( - partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('textbox-notebook')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') + chat.handle_send_instruction_click, gradio('interface_state'), gradio('textbox-notebook'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') shared.gradio['send_instruction_to_negative_prompt'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: x.update({'mode': 'instruct', 'history': {'internal': [], 'visible': []}}), gradio('interface_state'), None).then( - partial(chat.generate_chat_prompt, 'Input'), gradio('interface_state'), gradio('negative_prompt')).then( - lambda: None, None, 
None, js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}') + chat.handle_send_instruction_click, gradio('interface_state'), gradio('negative_prompt'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_generation_parameters()}}') shared.gradio['send-chat-to-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-default')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') + chat.handle_send_chat_click, gradio('interface_state'), gradio('textbox-default'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_default()}}') shared.gradio['send-chat-to-notebook'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - partial(chat.generate_chat_prompt, '', _continue=True), gradio('interface_state'), gradio('textbox-notebook')).then( - lambda: None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') + chat.handle_send_chat_click, gradio('interface_state'), gradio('textbox-notebook'), show_progress=False).then( + None, None, None, js=f'() => {{{ui.switch_tabs_js}; switch_to_notebook()}}') - shared.gradio['show_controls'].change(lambda x: None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') + shared.gradio['show_controls'].change(None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}') diff --git a/modules/ui_default.py b/modules/ui_default.py old mode 100755 new mode 100644 index 1f9625516c02593a230bc45298751bac31430f2c..112acd23583e7cfaf8626c930ab179ac2a6e2b47 --- a/modules/ui_default.py +++ b/modules/ui_default.py @@ -16,7 +16,6 @@ outputs = ('output_textbox', 'html-default') def create_ui(): mu = shared.args.multi_user with gr.Tab('Default', elem_id='default-tab'): - shared.gradio['last_input-default'] = gr.State('') with gr.Row(): with gr.Column(): with gr.Row(): @@ -63,42 +62,48 @@ def create_ui(): def create_event_handlers(): shared.gradio['Generate-default'].click( - lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox-default'].submit( - lambda x: x, gradio('textbox-default'), gradio('last_input-default')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + None, None, None, js=f'() 
=> {{{ui.audio_notification_js}}}') - shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False) shared.gradio['Continue-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, [shared.gradio['output_textbox']] + gradio(inputs)[1:], gradio(outputs), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + lambda state, left, right: state.update({'textbox-default': left, 'output_textbox': right}), gradio('interface_state', 'textbox-default', 'output_textbox'), None).then( + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['Stop-default'].click(stop_everything_event, None, None, queue=False) + shared.gradio['markdown_render-default'].click(lambda x: x, gradio('output_textbox'), gradio('markdown-default'), queue=False) shared.gradio['prompt_menu-default'].change(load_prompt, gradio('prompt_menu-default'), gradio('textbox-default'), show_progress=False) - shared.gradio['save_prompt-default'].click( - lambda x: x, gradio('textbox-default'), gradio('save_contents')).then( - lambda: 'prompts/', None, gradio('save_root')).then( - lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_prompt-default'].click( - lambda: 'prompts/', None, gradio('delete_root')).then( - lambda x: x + '.txt', gradio('prompt_menu-default'), gradio('delete_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - + shared.gradio['save_prompt-default'].click(handle_save_prompt, gradio('textbox-default'), gradio('save_contents', 'save_filename', 'save_root', 'file_saver'), show_progress=False) + shared.gradio['delete_prompt-default'].click(handle_delete_prompt, gradio('prompt_menu-default'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False) shared.gradio['textbox-default'].change(lambda x: f"{count_tokens(x)}", gradio('textbox-default'), gradio('token-counter-default'), show_progress=False) shared.gradio['get_logits-default'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( logits.get_next_logits, gradio('textbox-default', 'interface_state', 'use_samplers-default', 'logits-default'), gradio('logits-default', 'logits-default-previous'), show_progress=False) shared.gradio['get_tokens-default'].click(get_token_ids, gradio('textbox-default'), gradio('tokens-default'), show_progress=False) + + +def handle_save_prompt(text): + return [ + text, + utils.current_time() + ".txt", + "prompts/", + gr.update(visible=True) + ] + + +def handle_delete_prompt(prompt): + return [ + prompt + ".txt", + "prompts/", + gr.update(visible=True) + ] diff --git a/modules/ui_file_saving.py b/modules/ui_file_saving.py old mode 100755 new mode 100644 index 7147121773eb091a3c0bf6ebe14905783cf67405..ac72c623162fc2fe13e439fc8be3c4fc5176f3c4 --- a/modules/ui_file_saving.py +++ b/modules/ui_file_saving.py @@ -1,3 +1,5 @@ +import traceback + import gradio as gr from modules import chat, presets, shared, ui, utils @@ -47,57 +49,119 @@ def create_ui(): def create_event_handlers(): - shared.gradio['save_confirm'].click( - lambda x, y, z: utils.save_file(x + y, z), gradio('save_root', 'save_filename', 'save_contents'), None).then( - lambda: 
gr.update(visible=False), None, gradio('file_saver')) - - shared.gradio['delete_confirm'].click( - lambda x, y: utils.delete_file(x + y), gradio('delete_root', 'delete_filename'), None).then( - lambda: gr.update(visible=False), None, gradio('file_deleter')) - - shared.gradio['delete_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_deleter')) - shared.gradio['save_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_saver')) - - shared.gradio['save_character_confirm'].click( - chat.save_character, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), None).then( - lambda: gr.update(visible=False), None, gradio('character_saver')).then( - lambda x: gr.update(choices=utils.get_available_characters(), value=x), gradio('save_character_filename'), gradio('character_menu')) - - shared.gradio['delete_character_confirm'].click( - lambda x: str(utils.get_available_characters().index(x)), gradio('character_menu'), gradio('temporary_text')).then( - chat.delete_character, gradio('character_menu'), None).then( - chat.update_character_menu_after_deletion, gradio('temporary_text'), gradio('character_menu')).then( - lambda: gr.update(visible=False), None, gradio('character_deleter')) - - shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver')) - shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter')) - shared.gradio['save_preset'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - presets.generate_preset_yaml, gradio('interface_state'), gradio('save_preset_contents')).then( - lambda: 'My Preset', None, gradio('save_preset_filename')).then( - lambda: gr.update(visible=True), None, gradio('preset_saver')) - - shared.gradio['save_preset_confirm'].click( - lambda x, y: utils.save_file(f'presets/{x}.yaml', y), gradio('save_preset_filename', 'save_preset_contents'), None).then( - lambda: gr.update(visible=False), None, gradio('preset_saver')).then( - lambda x: gr.update(choices=utils.get_available_presets(), value=x), gradio('save_preset_filename'), gradio('preset_menu')) + handle_save_preset_click, gradio('interface_state'), gradio('save_preset_contents', 'save_preset_filename', 'preset_saver'), show_progress=False) - shared.gradio['save_preset_cancel'].click(lambda: gr.update(visible=False), None, gradio('preset_saver')) + shared.gradio['delete_preset'].click(handle_delete_preset_click, gradio('preset_menu'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False) + shared.gradio['save_grammar'].click(handle_save_grammar_click, gradio('grammar_string'), gradio('save_contents', 'save_filename', 'save_root', 'file_saver'), show_progress=False) + shared.gradio['delete_grammar'].click(handle_delete_grammar_click, gradio('grammar_file'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False) - shared.gradio['delete_preset'].click( - lambda x: f'{x}.yaml', gradio('preset_menu'), gradio('delete_filename')).then( - lambda: 'presets/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) + shared.gradio['save_preset_confirm'].click(handle_save_preset_confirm_click, gradio('save_preset_filename', 'save_preset_contents'), gradio('preset_menu', 'preset_saver'), show_progress=False) + shared.gradio['save_confirm'].click(handle_save_confirm_click, gradio('save_root', 'save_filename', 'save_contents'), 
gradio('file_saver'), show_progress=False) + shared.gradio['delete_confirm'].click(handle_delete_confirm_click, gradio('delete_root', 'delete_filename'), gradio('file_deleter'), show_progress=False) + shared.gradio['save_character_confirm'].click(handle_save_character_confirm_click, gradio('name2', 'greeting', 'context', 'character_picture', 'save_character_filename'), gradio('character_menu', 'character_saver'), show_progress=False) + shared.gradio['delete_character_confirm'].click(handle_delete_character_confirm_click, gradio('character_menu'), gradio('character_menu', 'character_deleter'), show_progress=False) - shared.gradio['save_grammar'].click( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda x: x, gradio('grammar_string'), gradio('save_contents')).then( - lambda: 'grammars/', None, gradio('save_root')).then( - lambda: 'My Fancy Grammar.gbnf', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_grammar'].click( - lambda x: x, gradio('grammar_file'), gradio('delete_filename')).then( - lambda: 'grammars/', None, gradio('delete_root')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) + shared.gradio['save_preset_cancel'].click(lambda: gr.update(visible=False), None, gradio('preset_saver'), show_progress=False) + shared.gradio['save_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_saver')) + shared.gradio['delete_cancel'].click(lambda: gr.update(visible=False), None, gradio('file_deleter')) + shared.gradio['save_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_saver'), show_progress=False) + shared.gradio['delete_character_cancel'].click(lambda: gr.update(visible=False), None, gradio('character_deleter'), show_progress=False) + + +def handle_save_preset_confirm_click(filename, contents): + try: + utils.save_file(f"presets/{filename}.yaml", contents) + available_presets = utils.get_available_presets() + output = gr.update(choices=available_presets, value=filename), + except Exception: + output = gr.update() + traceback.print_exc() + + return [ + output, + gr.update(visible=False) + ] + + +def handle_save_confirm_click(root, filename, contents): + try: + utils.save_file(root + filename, contents) + except Exception: + traceback.print_exc() + + return gr.update(visible=False) + + +def handle_delete_confirm_click(root, filename): + try: + utils.delete_file(root + filename) + except Exception: + traceback.print_exc() + + return gr.update(visible=False) + + +def handle_save_character_confirm_click(name2, greeting, context, character_picture, filename): + try: + chat.save_character(name2, greeting, context, character_picture, filename) + available_characters = utils.get_available_characters() + output = gr.update(choices=available_characters, value=filename) + except Exception: + output = gr.update() + traceback.print_exc() + + return [ + output, + gr.update(visible=False) + ] + + +def handle_delete_character_confirm_click(character): + try: + index = str(utils.get_available_characters().index(character)) + chat.delete_character(character) + output = chat.update_character_menu_after_deletion(index) + except Exception: + output = gr.update() + traceback.print_exc() + + return [ + output, + gr.update(visible=False) + ] + + +def handle_save_preset_click(state): + contents = presets.generate_preset_yaml(state) + return [ + contents, + "My Preset", + gr.update(visible=True) + ] + + +def 
handle_delete_preset_click(preset): + return [ + f"{preset}.yaml", + "presets/", + gr.update(visible=True) + ] + + +def handle_save_grammar_click(grammar_string): + return [ + grammar_string, + "My Fancy Grammar.gbnf", + "grammars/", + gr.update(visible=True) + ] + + +def handle_delete_grammar_click(grammar_file): + return [ + grammar_file, + "grammars/", + gr.update(visible=True) + ] diff --git a/modules/ui_model_menu.py b/modules/ui_model_menu.py old mode 100755 new mode 100644 index 7f7a3ab8e03b0bdfa68ca9e5581a18b69248a905..1883fdca4eea66bc4e4904f8b6300b08719f6ccc --- a/modules/ui_model_menu.py +++ b/modules/ui_model_menu.py @@ -66,7 +66,6 @@ def create_ui(): ui.create_refresh_button(shared.gradio['model_menu'], lambda: None, lambda: {'choices': utils.get_available_models()}, 'refresh-button', interactive=not mu) shared.gradio['load_model'] = gr.Button("Load", visible=not shared.settings['autoload_model'], elem_classes='refresh-button', interactive=not mu) shared.gradio['unload_model'] = gr.Button("Unload", elem_classes='refresh-button', interactive=not mu) - shared.gradio['reload_model'] = gr.Button("Reload", elem_classes='refresh-button', interactive=not mu) shared.gradio['save_model_settings'] = gr.Button("Save settings", elem_classes='refresh-button', interactive=not mu) with gr.Column(): @@ -94,32 +93,33 @@ def create_ui(): shared.gradio['hqq_backend'] = gr.Dropdown(label="hqq_backend", choices=["PYTORCH", "PYTORCH_COMPILE", "ATEN"], value=shared.args.hqq_backend) shared.gradio['n_gpu_layers'] = gr.Slider(label="n-gpu-layers", minimum=0, maximum=256, value=shared.args.n_gpu_layers, info='Must be set to more than 0 for your GPU to be used.') - shared.gradio['n_ctx'] = gr.Slider(minimum=0, maximum=shared.settings['truncation_length_max'], step=256, label="n_ctx", value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.') - shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. Example: 18,17') + shared.gradio['n_ctx'] = gr.Number(label="n_ctx", precision=0, step=256, value=shared.args.n_ctx, info='Context length. Try lowering this if you run out of memory while loading the model.') + shared.gradio['tensor_split'] = gr.Textbox(label='tensor_split', info='List of proportions to split the model across multiple GPUs. 
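
The ui_file_saving handlers above all follow the same pattern: one callback computes every output value and returns them in the order of the event's output components, replacing the old chains of .then() lambdas. A minimal sketch of that wiring, with made-up component names:

import gradio as gr

def handle_delete_click(name):
    # outputs: delete_filename, delete_root, file_deleter
    return [f'{name}.yaml', 'presets/', gr.update(visible=True)]

with gr.Blocks() as demo:
    preset_menu = gr.Textbox(value='My Preset')
    delete_filename = gr.Textbox()
    delete_root = gr.Textbox()
    file_deleter = gr.Column(visible=False)
    delete_button = gr.Button('Delete')

    delete_button.click(handle_delete_click, preset_menu, [delete_filename, delete_root, file_deleter], show_progress=False)
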
Example: 60,40') shared.gradio['n_batch'] = gr.Slider(label="n_batch", minimum=1, maximum=2048, step=1, value=shared.args.n_batch) - shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=32, value=shared.args.threads) - shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=32, value=shared.args.threads_batch) + shared.gradio['threads'] = gr.Slider(label="threads", minimum=0, step=1, maximum=256, value=shared.args.threads) + shared.gradio['threads_batch'] = gr.Slider(label="threads_batch", minimum=0, step=1, maximum=256, value=shared.args.threads_batch) shared.gradio['wbits'] = gr.Dropdown(label="wbits", choices=["None", 1, 2, 3, 4, 8], value=shared.args.wbits if shared.args.wbits > 0 else "None") shared.gradio['groupsize'] = gr.Dropdown(label="groupsize", choices=["None", 32, 64, 128, 1024], value=shared.args.groupsize if shared.args.groupsize > 0 else "None") - shared.gradio['model_type'] = gr.Dropdown(label="model_type", choices=["None"], value=shared.args.model_type or "None") - shared.gradio['pre_layer'] = gr.Slider(label="pre_layer", minimum=0, maximum=100, value=shared.args.pre_layer[0] if shared.args.pre_layer is not None else 0) shared.gradio['gpu_split'] = gr.Textbox(label='gpu-split', info='Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7') - shared.gradio['max_seq_len'] = gr.Slider(label='max_seq_len', minimum=0, maximum=shared.settings['truncation_length_max'], step=256, info='Context length. Try lowering this if you run out of memory while loading the model.', value=shared.args.max_seq_len) + shared.gradio['max_seq_len'] = gr.Number(label='max_seq_len', precision=0, step=256, value=shared.args.max_seq_len, info='Context length. Try lowering this if you run out of memory while loading the model.') with gr.Blocks(): - shared.gradio['alpha_value'] = gr.Slider(label='alpha_value', minimum=1, maximum=8, step=0.05, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.', value=shared.args.alpha_value) - shared.gradio['rope_freq_base'] = gr.Slider(label='rope_freq_base', minimum=0, maximum=1000000, step=1000, info='If greater than 0, will be used instead of alpha_value. Those two are related by rope_freq_base = 10000 * alpha_value ^ (64 / 63)', value=shared.args.rope_freq_base) - shared.gradio['compress_pos_emb'] = gr.Slider(label='compress_pos_emb', minimum=1, maximum=8, step=1, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). Equal to 1/rope_freq_scale.', value=shared.args.compress_pos_emb) + shared.gradio['alpha_value'] = gr.Number(label='alpha_value', value=shared.args.alpha_value, precision=2, info='Positional embeddings alpha factor for NTK RoPE scaling. Recommended values (NTKv1): 1.75 for 1.5x context, 2.5 for 2x context. Use either this or compress_pos_emb, not both.') + shared.gradio['rope_freq_base'] = gr.Number(label='rope_freq_base', value=shared.args.rope_freq_base, precision=0, info='Positional embeddings frequency base for NTK RoPE scaling. Related to alpha_value by rope_freq_base = 10000 * alpha_value ^ (64 / 63). 0 = from model.') + shared.gradio['compress_pos_emb'] = gr.Number(label='compress_pos_emb', value=shared.args.compress_pos_emb, precision=2, info='Positional embeddings compression factor. Should be set to (context length) / (model\'s original context length). 
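
A quick check of the relation quoted in the rope_freq_base hint above, using the alpha value that the alpha_value hint recommends for 2x context:

alpha_value = 2.5
rope_freq_base = 10000 * alpha_value ** (64 / 63)
print(round(rope_freq_base))   # ~25366
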
Equal to 1/rope_freq_scale.')
            shared.gradio['autogptq_info'] = gr.Markdown('ExLlamav2_HF is recommended over AutoGPTQ for models derived from Llama.')
-           shared.gradio['quipsharp_info'] = gr.Markdown('QuIP# has to be installed manually at the moment.')

         with gr.Column():
            shared.gradio['load_in_8bit'] = gr.Checkbox(label="load-in-8bit", value=shared.args.load_in_8bit)
            shared.gradio['load_in_4bit'] = gr.Checkbox(label="load-in-4bit", value=shared.args.load_in_4bit)
            shared.gradio['use_double_quant'] = gr.Checkbox(label="use_double_quant", value=shared.args.use_double_quant)
            shared.gradio['use_flash_attention_2'] = gr.Checkbox(label="use_flash_attention_2", value=shared.args.use_flash_attention_2, info='Set use_flash_attention_2=True while loading the model.')
+           shared.gradio['use_eager_attention'] = gr.Checkbox(label="use_eager_attention", value=shared.args.use_eager_attention, info='Set attn_implementation=eager while loading the model.')
+           shared.gradio['flash_attn'] = gr.Checkbox(label="flash_attn", value=shared.args.flash_attn, info='Use flash-attention.')
            shared.gradio['auto_devices'] = gr.Checkbox(label="auto-devices", value=shared.args.auto_devices)
-           shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This increases performance on RTX cards.')
+           shared.gradio['tensorcores'] = gr.Checkbox(label="tensorcores", value=shared.args.tensorcores, info='NVIDIA only: use llama-cpp-python compiled with tensor cores support. This may increase performance on newer cards.')
+           shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.')
+           shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.')
            shared.gradio['streaming_llm'] = gr.Checkbox(label="streaming_llm", value=shared.args.streaming_llm, info='(experimental) Activate StreamingLLM to avoid re-evaluating the entire prompt when old messages are removed.')
            shared.gradio['attention_sink_size'] = gr.Number(label="attention_sink_size", value=shared.args.attention_sink_size, precision=0, info='StreamingLLM: number of sink tokens. Only used if the trimmed prompt doesn\'t share a prefix with the old prompt.')
            shared.gradio['cpu'] = gr.Checkbox(label="cpu", value=shared.args.cpu, info='llama.cpp: Use llama-cpp-python compiled without GPU acceleration. Transformers: use PyTorch in CPU mode.')
@@ -127,7 +127,6 @@ def create_ui():
            shared.gradio['no_offload_kqv'] = gr.Checkbox(label="no_offload_kqv", value=shared.args.no_offload_kqv, info='Do not offload the K, Q, V to the GPU. This saves VRAM but reduces the performance.')
            shared.gradio['no_mul_mat_q'] = gr.Checkbox(label="no_mul_mat_q", value=shared.args.no_mul_mat_q, info='Disable the mulmat kernels.')
            shared.gradio['triton'] = gr.Checkbox(label="triton", value=shared.args.triton)
-           shared.gradio['no_inject_fused_attention'] = gr.Checkbox(label="no_inject_fused_attention", value=shared.args.no_inject_fused_attention, info='Disable fused attention. Fused attention improves inference performance but uses more VRAM. Fuses layers for AutoAWQ. Disable if running low on VRAM.')
            shared.gradio['no_inject_fused_mlp'] = gr.Checkbox(label="no_inject_fused_mlp", value=shared.args.no_inject_fused_mlp, info='Affects Triton only. Disable fused MLP. Fused MLP improves performance but uses more VRAM. 
Disable if running low on VRAM.') shared.gradio['no_use_cuda_fp16'] = gr.Checkbox(label="no_use_cuda_fp16", value=shared.args.no_use_cuda_fp16, info='This can make models faster on some systems.') shared.gradio['desc_act'] = gr.Checkbox(label="desc_act", value=shared.args.desc_act, info='\'desc_act\', \'wbits\', and \'groupsize\' are used for old models without a quantize_config.json.') @@ -136,11 +135,12 @@ def create_ui(): shared.gradio['numa'] = gr.Checkbox(label="numa", value=shared.args.numa, info='NUMA support can help on some systems with non-uniform memory access.') shared.gradio['disk'] = gr.Checkbox(label="disk", value=shared.args.disk) shared.gradio['bf16'] = gr.Checkbox(label="bf16", value=shared.args.bf16) - shared.gradio['cache_8bit'] = gr.Checkbox(label="cache_8bit", value=shared.args.cache_8bit, info='Use 8-bit cache to save VRAM.') - shared.gradio['cache_4bit'] = gr.Checkbox(label="cache_4bit", value=shared.args.cache_4bit, info='Use Q4 cache to save VRAM.') shared.gradio['autosplit'] = gr.Checkbox(label="autosplit", value=shared.args.autosplit, info='Automatically split the model tensors across the available GPUs.') - shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn, info='Force flash-attention to not be used.') + shared.gradio['no_flash_attn'] = gr.Checkbox(label="no_flash_attn", value=shared.args.no_flash_attn) + shared.gradio['no_xformers'] = gr.Checkbox(label="no_xformers", value=shared.args.no_xformers) + shared.gradio['no_sdpa'] = gr.Checkbox(label="no_sdpa", value=shared.args.no_sdpa) shared.gradio['cfg_cache'] = gr.Checkbox(label="cfg-cache", value=shared.args.cfg_cache, info='Necessary to use CFG with this loader.') + shared.gradio['cpp_runner'] = gr.Checkbox(label="cpp-runner", value=shared.args.cpp_runner, info='Enable inference with ModelRunnerCpp, which is faster than the default ModelRunner.') shared.gradio['num_experts_per_token'] = gr.Number(label="Number of experts per token", value=shared.args.num_experts_per_token, info='Only applies to MoE models like Mixtral.') with gr.Blocks(): shared.gradio['trust_remote_code'] = gr.Checkbox(label="trust-remote-code", value=shared.args.trust_remote_code, info='Set trust_remote_code=True while loading the tokenizer/model. To enable this option, start the web UI with the --trust-remote-code flag.', interactive=shared.args.trust_remote_code) @@ -149,9 +149,9 @@ def create_ui(): shared.gradio['disable_exllama'] = gr.Checkbox(label="disable_exllama", value=shared.args.disable_exllama, info='Disable ExLlama kernel for GPTQ models.') shared.gradio['disable_exllamav2'] = gr.Checkbox(label="disable_exllamav2", value=shared.args.disable_exllamav2, info='Disable ExLlamav2 kernel for GPTQ models.') - shared.gradio['gptq_for_llama_info'] = gr.Markdown('Legacy loader for compatibility with older GPUs. ExLlamav2_HF or AutoGPTQ are preferred for GPTQ models when supported.') shared.gradio['exllamav2_info'] = gr.Markdown("ExLlamav2_HF is recommended over ExLlamav2 for better integration with extensions and more consistent sampling behavior across loaders.") shared.gradio['llamacpp_HF_info'] = gr.Markdown("llamacpp_HF loads llama.cpp as a Transformers model. To use it, you need to place your GGUF in a subfolder of models/ with the necessary tokenizer files.\n\nYou can use the \"llamacpp_HF creator\" menu to do that automatically.") + shared.gradio['tensorrt_llm_info'] = gr.Markdown('* TensorRT-LLM has to be installed manually in a separate Python 3.10 environment at the moment. 
For a guide, consult the description of [this PR](https://github.com/oobabooga/text-generation-webui/pull/5715). \n\n* `max_seq_len` is only used when `cpp-runner` is checked.\n\n* `cpp_runner` does not support streaming at the moment.') with gr.Column(): with gr.Row(): @@ -186,41 +186,24 @@ def create_ui(): def create_event_handlers(): - shared.gradio['loader'].change( - loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params())).then( - lambda value: gr.update(choices=loaders.get_model_types(value)), gradio('loader'), gradio('model_type')) + shared.gradio['loader'].change(loaders.make_loader_params_visible, gradio('loader'), gradio(loaders.get_all_params()), show_progress=False) # In this event handler, the interface state is read and updated # with the model defaults (if any), and then the model is loaded # unless "autoload_model" is unchecked shared.gradio['model_menu'].change( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - apply_model_settings_to_state, gradio('model_menu', 'interface_state'), gradio('interface_state')).then( - ui.apply_interface_values, gradio('interface_state'), gradio(ui.list_interface_input_elements()), show_progress=False).then( - update_model_parameters, gradio('interface_state'), None).then( + handle_load_model_event_initial, gradio('model_menu', 'interface_state'), gradio(ui.list_interface_input_elements()) + gradio('interface_state'), show_progress=False).then( load_model_wrapper, gradio('model_menu', 'loader', 'autoload_model'), gradio('model_status'), show_progress=False).success( - update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then( - lambda x: x, gradio('loader'), gradio('filter_by_loader')) + handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) shared.gradio['load_model'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( update_model_parameters, gradio('interface_state'), None).then( partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success( - update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then( - lambda x: x, gradio('loader'), gradio('filter_by_loader')) - - shared.gradio['reload_model'].click( - unload_model, None, None).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - update_model_parameters, gradio('interface_state'), None).then( - partial(load_model_wrapper, autoload=True), gradio('model_menu', 'loader'), gradio('model_status'), show_progress=False).success( - update_truncation_length, gradio('truncation_length', 'interface_state'), gradio('truncation_length')).then( - lambda x: x, gradio('loader'), gradio('filter_by_loader')) - - shared.gradio['unload_model'].click( - unload_model, None, None).then( - lambda: "Model unloaded", None, gradio('model_status')) + handle_load_model_event_final, gradio('truncation_length', 'loader', 'interface_state'), gradio('truncation_length', 'filter_by_loader'), show_progress=False) + shared.gradio['unload_model'].click(handle_unload_model_click, None, gradio('model_status'), show_progress=False) shared.gradio['save_model_settings'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( save_model_settings, 
gradio('model_menu', 'interface_state'), gradio('model_status'), show_progress=False) @@ -272,6 +255,10 @@ def load_lora_wrapper(selected_loras): def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), return_links=False, check=False): try: + if repo_id == "": + yield ("Please enter a model path") + return + downloader = importlib.import_module("download-model").ModelDownloader() progress(0.0) @@ -289,7 +276,13 @@ def download_model_wrapper(repo_id, specific_file, progress=gr.Progress(), retur return yield ("Getting the output folder") - output_folder = downloader.get_output_folder(model, branch, is_lora, is_llamacpp=is_llamacpp) + output_folder = downloader.get_output_folder( + model, + branch, + is_lora, + is_llamacpp=is_llamacpp, + model_dir=shared.args.model_dir if shared.args.model_dir != shared.args_defaults.model_dir else None + ) if output_folder == Path("models"): output_folder = Path(shared.args.model_dir) @@ -343,3 +336,20 @@ def update_truncation_length(current_length, state): return state['n_ctx'] return current_length + + +def handle_load_model_event_initial(model, state): + state = apply_model_settings_to_state(model, state) + output = ui.apply_interface_values(state) + update_model_parameters(state) + return output + [state] + + +def handle_load_model_event_final(truncation_length, loader, state): + truncation_length = update_truncation_length(truncation_length, state) + return [truncation_length, loader] + + +def handle_unload_model_click(): + unload_model() + return "Model unloaded" diff --git a/modules/ui_notebook.py b/modules/ui_notebook.py old mode 100755 new mode 100644 index a7c62baf5e5dc64182f2554ff98b4d3180de3824..799328447c3c08d6121354dcd68ce268b3094964 --- a/modules/ui_notebook.py +++ b/modules/ui_notebook.py @@ -7,6 +7,7 @@ from modules.text_generation import ( get_token_ids, stop_everything_event ) +from modules.ui_default import handle_delete_prompt, handle_save_prompt from modules.utils import gradio inputs = ('textbox-notebook', 'interface_state') @@ -66,38 +67,32 @@ def create_event_handlers(): lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') shared.gradio['textbox-notebook'].submit( lambda x: x, gradio('textbox-notebook'), gradio('last_input-notebook')).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') - shared.gradio['Undo'].click(lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False) - shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), 
gradio('markdown-notebook'), queue=False) shared.gradio['Regenerate-notebook'].click( lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False).then( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( generate_reply_wrapper, gradio(inputs), gradio(outputs), show_progress=False).then( - ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( - lambda: None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None).then( + None, None, None, js=f'() => {{{ui.audio_notification_js}}}') + shared.gradio['Undo'].click( + lambda x: x, gradio('last_input-notebook'), gradio('textbox-notebook'), show_progress=False).then( + lambda state, text: state.update({'textbox-notebook': text}), gradio('interface_state', 'textbox-notebook'), None) + + shared.gradio['markdown_render-notebook'].click(lambda x: x, gradio('textbox-notebook'), gradio('markdown-notebook'), queue=False) shared.gradio['Stop-notebook'].click(stop_everything_event, None, None, queue=False) shared.gradio['prompt_menu-notebook'].change(load_prompt, gradio('prompt_menu-notebook'), gradio('textbox-notebook'), show_progress=False) - shared.gradio['save_prompt-notebook'].click( - lambda x: x, gradio('textbox-notebook'), gradio('save_contents')).then( - lambda: 'prompts/', None, gradio('save_root')).then( - lambda: utils.current_time() + '.txt', None, gradio('save_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_saver')) - - shared.gradio['delete_prompt-notebook'].click( - lambda: 'prompts/', None, gradio('delete_root')).then( - lambda x: x + '.txt', gradio('prompt_menu-notebook'), gradio('delete_filename')).then( - lambda: gr.update(visible=True), None, gradio('file_deleter')) - + shared.gradio['save_prompt-notebook'].click(handle_save_prompt, gradio('textbox-notebook'), gradio('save_contents', 'save_filename', 'save_root', 'file_saver'), show_progress=False) + shared.gradio['delete_prompt-notebook'].click(handle_delete_prompt, gradio('prompt_menu-notebook'), gradio('delete_filename', 'delete_root', 'file_deleter'), show_progress=False) shared.gradio['textbox-notebook'].input(lambda x: f"{count_tokens(x)}", gradio('textbox-notebook'), gradio('token-counter-notebook'), show_progress=False) shared.gradio['get_logits-notebook'].click( ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( diff --git a/modules/ui_parameters.py b/modules/ui_parameters.py old mode 100755 new mode 100644 index 0414cdde0466cd8ef833f04b5b0479a1e71ed1e4..eff62c20e5da1a91766b595c64eb1275c9a0aea5 --- a/modules/ui_parameters.py +++ b/modules/ui_parameters.py @@ -38,6 +38,13 @@ def create_ui(default_preset): shared.gradio['presence_penalty'] = gr.Slider(0, 2, value=generate_params['presence_penalty'], step=0.05, label='presence_penalty') shared.gradio['repetition_penalty_range'] = gr.Slider(0, 4096, step=64, value=generate_params['repetition_penalty_range'], label='repetition_penalty_range') shared.gradio['do_sample'] = gr.Checkbox(value=generate_params['do_sample'], label='do_sample') + + with gr.Blocks(): + shared.gradio['dry_multiplier'] = gr.Slider(0, 5, value=generate_params['dry_multiplier'], step=0.01, label='dry_multiplier', info='Set to greater than 0 to enable DRY. 
Recommended value: 0.8.') + shared.gradio['dry_allowed_length'] = gr.Slider(1, 20, value=generate_params['dry_allowed_length'], step=1, label='dry_allowed_length', info='Longest sequence that can be repeated without being penalized.') + shared.gradio['dry_base'] = gr.Slider(1, 4, value=generate_params['dry_base'], step=0.01, label='dry_base', info='Controls how fast the penalty grows with increasing sequence length.') + shared.gradio['dry_sequence_breakers'] = gr.Textbox(value=generate_params['dry_sequence_breakers'], label='dry_sequence_breakers', info='Tokens across which sequence matching is not continued. Specified as a comma-separated list of quoted strings.') + gr.Markdown("[Learn more](https://github.com/oobabooga/text-generation-webui/wiki/03-%E2%80%90-Parameters-Tab)") with gr.Column(): @@ -82,7 +89,7 @@ def create_ui(default_preset): shared.gradio['sampler_priority'] = gr.Textbox(value=generate_params['sampler_priority'], lines=12, label='Sampler priority', info='Parameter names separated by new lines or commas.') with gr.Column(): - shared.gradio['truncation_length'] = gr.Slider(value=get_truncation_length(), minimum=shared.settings['truncation_length_min'], maximum=shared.settings['truncation_length_max'], step=256, label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') + shared.gradio['truncation_length'] = gr.Number(precision=0, step=256, value=get_truncation_length(), label='Truncate the prompt up to this length', info='The leftmost tokens are removed if the prompt exceeds this length. Most models require this to be at most 2048.') shared.gradio['prompt_lookup_num_tokens'] = gr.Slider(value=shared.settings['prompt_lookup_num_tokens'], minimum=0, maximum=10, step=1, label='prompt_lookup_num_tokens', info='Activates Prompt Lookup Decoding.') shared.gradio['max_tokens_second'] = gr.Slider(value=shared.settings['max_tokens_second'], minimum=0, maximum=20, step=1, label='Maximum tokens/second', info='To make text readable in real time.') shared.gradio['max_updates_second'] = gr.Slider(value=shared.settings['max_updates_second'], minimum=0, maximum=24, step=1, label='Maximum UI updates/second', info='Set this if you experience lag in the UI during streaming.') @@ -95,10 +102,16 @@ def create_ui(default_preset): def create_event_handlers(): shared.gradio['filter_by_loader'].change(loaders.blacklist_samplers, gradio('filter_by_loader', 'dynamic_temperature'), gradio(loaders.list_all_samplers()), show_progress=False) - shared.gradio['preset_menu'].change(presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params())) - shared.gradio['random_preset'].click(presets.random_preset, gradio('interface_state'), gradio('interface_state') + gradio(presets.presets_params())) - shared.gradio['grammar_file'].change(load_grammar, gradio('grammar_file'), gradio('grammar_string')) - shared.gradio['dynamic_temperature'].change(lambda x: [gr.update(visible=x)] * 3, gradio('dynamic_temperature'), gradio('dynatemp_low', 'dynatemp_high', 'dynatemp_exponent')) + shared.gradio['preset_menu'].change( + ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then( + presets.load_preset_for_ui, gradio('preset_menu', 'interface_state'), gradio('interface_state') + gradio(presets.presets_params()), show_progress=False) + + shared.gradio['random_preset'].click( + ui.gather_interface_values, 
gradio(shared.input_elements), gradio('interface_state')).then( + presets.random_preset, gradio('interface_state'), gradio('interface_state') + gradio(presets.presets_params()), show_progress=False) + + shared.gradio['grammar_file'].change(load_grammar, gradio('grammar_file'), gradio('grammar_string'), show_progress=False) + shared.gradio['dynamic_temperature'].change(lambda x: [gr.update(visible=x)] * 3, gradio('dynamic_temperature'), gradio('dynatemp_low', 'dynatemp_high', 'dynatemp_exponent'), show_progress=False) def get_truncation_length(): diff --git a/modules/ui_session.py b/modules/ui_session.py old mode 100755 new mode 100644 index 08929c33865bc39b416f0d0e1615fe331c47d737..dfb95b83d5f438293b4ad19daedc3fcd485e9248 --- a/modules/ui_session.py +++ b/modules/ui_session.py @@ -32,18 +32,25 @@ def create_ui(): # Reset interface event shared.gradio['reset_interface'].click( set_interface_arguments, gradio('extensions_menu', 'bool_menu'), None).then( - lambda: None, None, None, js='() => {document.body.innerHTML=\'Reloading...
\'; setTimeout(function(){location.reload()},2500); return []}')
+        None, None, None, js='() => {document.body.innerHTML=\'Reloading...
\'; setTimeout(function(){location.reload()},2500); return []}')

     shared.gradio['toggle_dark_mode'].click(
-        lambda: None, None, None, js='() => {document.getElementsByTagName("body")[0].classList.toggle("dark")}').then(
-        lambda x: 'dark' if x == 'light' else 'light', gradio('theme_state'), gradio('theme_state'))
+        lambda x: 'dark' if x == 'light' else 'light', gradio('theme_state'), gradio('theme_state')).then(
+        None, None, None, js=f'() => {{{ui.dark_theme_js}; toggleDarkMode()}}')

     shared.gradio['save_settings'].click(
         ui.gather_interface_values, gradio(shared.input_elements), gradio('interface_state')).then(
-        ui.save_settings, gradio('interface_state', 'preset_menu', 'extensions_menu', 'show_controls', 'theme_state'), gradio('save_contents')).then(
-        lambda: './', None, gradio('save_root')).then(
-        lambda: 'settings.yaml', None, gradio('save_filename')).then(
-        lambda: gr.update(visible=True), None, gradio('file_saver'))
+        handle_save_settings, gradio('interface_state', 'preset_menu', 'extensions_menu', 'show_controls', 'theme_state'), gradio('save_contents', 'save_filename', 'save_root', 'file_saver'), show_progress=False)
+
+
+def handle_save_settings(state, preset, extensions, show_controls, theme):
+    contents = ui.save_settings(state, preset, extensions, show_controls, theme)
+    return [
+        contents,
+        "settings.yaml",
+        "./",
+        gr.update(visible=True)
+    ]


 def set_interface_arguments(extensions, bool_active):
diff --git a/modules/utils.py b/modules/utils.py
old mode 100755
new mode 100644
index 4b65736b6b838fff4fbaf27fdf2045d81ed351b6..f4333031fe9946bf871a33415d47da115978bd62
--- a/modules/utils.py
+++ b/modules/utils.py
@@ -95,11 +95,10 @@ def get_available_presets():

 def get_available_prompts():
-    prompts = []
-    files = set((k.stem for k in Path('prompts').glob('*.txt')))
-    prompts += sorted([k for k in files if re.match('^[0-9]', k)], key=natural_keys, reverse=True)
-    prompts += sorted([k for k in files if re.match('^[^0-9]', k)], key=natural_keys)
-    prompts += ['None']
+    prompt_files = list(Path('prompts').glob('*.txt'))
+    sorted_files = sorted(prompt_files, key=lambda x: x.stat().st_mtime, reverse=True)
+    prompts = [file.stem for file in sorted_files]
+    prompts.append('None')

     return prompts

diff --git a/one_click.py b/one_click.py
old mode 100755
new mode 100644
index 0d543e303fbcfeb3ea03df0219c4a210fdc1c23b..0a0412ba07d36abe9869283b916fbbd808e6bcf3
--- a/one_click.py
+++ b/one_click.py
@@ -16,9 +16,9 @@ import sys

 # Define the required PyTorch version
-TORCH_VERSION = "2.2.1"
-TORCHVISION_VERSION = "0.17.1"
-TORCHAUDIO_VERSION = "2.2.1"
+TORCH_VERSION = "2.2.2"
+TORCHVISION_VERSION = "0.17.2"
+TORCHAUDIO_VERSION = "2.2.2"

 # Environment
 script_dir = os.getcwd()
@@ -315,7 +315,7 @@ def install_webui():
     run_cmd("conda install -y libuv")

     # Install the webui requirements
-    update_requirements(initial_installation=True)
+    update_requirements(initial_installation=True, pull=False)


 def get_extensions_names():
@@ -388,7 +388,12 @@ def update_requirements(initial_installation=False, pull=True):
     # Prepare the requirements file
     textgen_requirements = open(requirements_file).read().splitlines()
     if is_cuda118:
-        textgen_requirements = [req.replace('+cu121', '+cu118').replace('+cu122', '+cu118') for req in textgen_requirements]
+        textgen_requirements = [
+            req.replace('+cu121', '+cu118').replace('+cu122', '+cu118')
+            for req in textgen_requirements
+            if "auto-gptq" not in req.lower() and "autoawq" not in req.lower()
+        ]
+
     if is_windows() and is_cuda118:  # No flash-attention on Windows for 
CUDA 11 textgen_requirements = [req for req in textgen_requirements if 'oobabooga/flash-attention' not in req] diff --git a/presets/Big O.yaml b/presets/Big O.yaml old mode 100755 new mode 100644 diff --git a/presets/Contrastive Search.yaml b/presets/Contrastive Search.yaml old mode 100755 new mode 100644 diff --git a/presets/Debug-deterministic.yaml b/presets/Debug-deterministic.yaml old mode 100755 new mode 100644 diff --git a/presets/Divine Intellect.yaml b/presets/Divine Intellect.yaml old mode 100755 new mode 100644 diff --git a/presets/LLaMA-Precise.yaml b/presets/LLaMA-Precise.yaml old mode 100755 new mode 100644 diff --git a/presets/Midnight Enigma.yaml b/presets/Midnight Enigma.yaml old mode 100755 new mode 100644 diff --git a/presets/Null preset.yaml b/presets/Null preset.yaml old mode 100755 new mode 100644 diff --git a/presets/Shortwave.yaml b/presets/Shortwave.yaml old mode 100755 new mode 100644 diff --git a/presets/Yara.yaml b/presets/Yara.yaml old mode 100755 new mode 100644 diff --git a/presets/min_p.yaml b/presets/min_p.yaml old mode 100755 new mode 100644 diff --git a/presets/simple-1.yaml b/presets/simple-1.yaml old mode 100755 new mode 100644 diff --git a/prompts/Alpaca-with-Input.txt b/prompts/Alpaca-with-Input.txt old mode 100755 new mode 100644 diff --git a/prompts/QA.txt b/prompts/QA.txt old mode 100755 new mode 100644 diff --git a/requirements.txt b/requirements.txt old mode 100755 new mode 100644 index 4da825ac00525cce74d613da6b73b8d59f0f3c4e..51558a845a21db40d973d7d1dbba3a84806bcc31 --- a/requirements.txt +++ b/requirements.txt @@ -1,23 +1,24 @@ -#accelerate==0.27.* -accelerate -aqlm[gpu,cpu]==1.1.3; platform_system == "Linux" -#bitsandbytes==0.43.* -bitsandbytes +accelerate==0.33.* +aqlm[gpu,cpu]==1.1.6; platform_system == "Linux" +auto-gptq==0.7.1 +bitsandbytes==0.43.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* -hqq==0.1.5 -jinja2==3.1.2 +hqq==0.1.7.post3 +jinja2==3.1.4 lm_eval==0.3.0 markdown numba==0.59.* numpy==1.26.* optimum==1.17.* pandas -peft==0.8.* +peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -25,7 +26,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.39.* +transformers==4.44.* tqdm wandb @@ -36,39 +37,38 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.61+cpuavx2-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.61+cpuavx2-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.61+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.61+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # llama-cpp-python (CUDA, no tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.61+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.61+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.61+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.61+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # llama-cpp-python (CUDA, tensor cores) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.61+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.61+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.61+cu121-cp311-cp311-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.61+cu121-cp310-cp310-manylinux_2_31_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == 
"3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" # CUDA wheels -https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/turboderp/exllamav2/releases/download/v0.0.18/exllamav2-0.0.18-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" -https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" 
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" -https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64" +https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; 
platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd.txt b/requirements_amd.txt old mode 100755 new mode 100644 index 1ab80f7624c65b7556eb17c3cf19f8dd893a237d..e9dfb1283ea21a4177650d87ebe750669e7eaa53 --- a/requirements_amd.txt +++ b/requirements_amd.txt @@ -1,19 +1,21 @@ -accelerate==0.27.* +accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* -hqq==0.1.5 -jinja2==3.1.2 +hqq==0.1.7.post3 +jinja2==3.1.4 lm_eval==0.3.0 markdown numba==0.59.* numpy==1.26.* optimum==1.17.* pandas -peft==0.8.* +peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -21,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.40.* +transformers==4.44.* tqdm wandb @@ -32,20 +34,18 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.65+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.65+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" -https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.89+rocm5.6.1-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/rocm/llama_cpp_python_cuda-0.2.89+rocm5.6.1-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == 
"3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_amd_noavx2.txt b/requirements_amd_noavx2.txt old mode 100755 new mode 100644 index 5d86bf09952e6add0df8d26f1843e7fee9419680..69a350ad54fc841fb28bc1246a7762e3bccee743 --- a/requirements_amd_noavx2.txt +++ b/requirements_amd_noavx2.txt @@ -1,19 +1,21 @@ -accelerate==0.27.* +accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* -hqq==0.1.5 -jinja2==3.1.2 +hqq==0.1.7.post3 +jinja2==3.1.4 lm_eval==0.3.0 markdown numba==0.59.* numpy==1.26.* optimum==1.17.* pandas -peft==0.8.* +peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -21,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.40.* +transformers==4.44.* tqdm wandb @@ -32,18 +34,16 @@ sse-starlette==1.6.5 tiktoken # llama-cpp-python (CPU only, no AVX2) -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10" # AMD wheels -https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" 
and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" -https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+rocm5.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" -https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" -https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+rocm5.6.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system != "Darwin" and platform_machine != "x86_64" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11" +https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7+rocm561-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10" diff --git a/requirements_apple_intel.txt b/requirements_apple_intel.txt old mode 100755 new mode 100644 index 80dcff015d69247aaca251ea7eba9ae501d4db89..0326916eeee5967911dd2836e5de490039b12e8a --- a/requirements_apple_intel.txt +++ b/requirements_apple_intel.txt @@ -1,19 +1,21 @@ -accelerate==0.27.* +accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* -hqq==0.1.5 -jinja2==3.1.2 +hqq==0.1.7.post3 +jinja2==3.1.4 lm_eval==0.3.0 markdown numba==0.59.* numpy==1.26.* optimum==1.17.* pandas -peft==0.8.* +peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -21,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard -transformers==4.40.* +transformers==4.44.* tqdm wandb @@ -32,10 +34,8 @@ sse-starlette==1.6.5 tiktoken # Mac wheels 
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp311-cp311-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp310-cp310-macosx_11_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" -https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" -https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_12_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11" +https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_14_0_x86_64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10" +https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl diff --git a/requirements_apple_silicon.txt b/requirements_apple_silicon.txt old mode 100755 new mode 100644 index e8c9583c75bbb08089e30c0c5d75870b77015c4b..70ee3d27a1f3f7c4f8bdd797e4a0722f3fba97f8 --- a/requirements_apple_silicon.txt +++ b/requirements_apple_silicon.txt @@ -1,19 +1,21 @@ -accelerate==0.27.* +accelerate==0.33.* colorama datasets einops +fastapi==0.112.4 gradio==4.26.* -hqq==0.1.5 -jinja2==3.1.2 +hqq==0.1.7.post3 +jinja2==3.1.4 lm_eval==0.3.0 markdown numba==0.59.* numpy==1.26.* optimum==1.17.* pandas -peft==0.8.* +peft==0.12.* Pillow>=9.5.0 psutil +pydantic==2.8.2 pyyaml requests rich @@ -21,7 +23,7 @@ safetensors==0.4.* scipy sentencepiece tensorboard 
-transformers==4.40.*
+transformers==4.44.*
 tqdm
 wandb
@@ -32,12 +34,10 @@ sse-starlette==1.6.5
 tiktoken
 # Mac wheels
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp311-cp311-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp310-cp310-macosx_11_0_arm64.whl; platform_system == "Darwin" and platform_release >= "20.0.0" and platform_release < "21.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.65-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_12_0_arm64.whl; platform_system == "Darwin" and platform_release >= "21.0.0" and platform_release < "22.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_13_0_arm64.whl; platform_system == "Darwin" and platform_release >= "22.0.0" and platform_release < "23.0.0" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp311-cp311-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/metal/llama_cpp_python-0.2.89-cp310-cp310-macosx_14_0_arm64.whl; platform_system == "Darwin" and platform_release >= "23.0.0" and platform_release < "24.0.0" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl
diff --git a/requirements_cpu_only.txt b/requirements_cpu_only.txt
old mode 100755
new mode 100644
index 56a27ba4aa7ce853275d3854bdc94986abd11af1..fe2d796ccdd1d2276a9ca87917bef7e053c30edd
--- a/requirements_cpu_only.txt
+++ b/requirements_cpu_only.txt
@@ -1,19 +1,21 @@
-accelerate==0.27.*
+accelerate==0.33.*
 colorama
 datasets
 einops
+fastapi==0.112.4
 gradio==4.26.*
-hqq==0.1.5
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
 numpy==1.26.*
 optimum==1.17.*
 pandas
-peft==0.8.*
+peft==0.12.*
 Pillow>=9.5.0
 psutil
+pydantic==2.8.2
 pyyaml
 requests
 rich
@@ -21,7 +23,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.44.*
 tqdm
 wandb
@@ -32,7 +34,7 @@ sse-starlette==1.6.5
 tiktoken
 # llama-cpp-python (CPU only, AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_cpu_only_noavx2.txt b/requirements_cpu_only_noavx2.txt
old mode 100755
new mode 100644
index 3b97c75e4fa6c6e823ae891d01b8d16b3b5cd0d0..982a12a162c326901e86d3f5255de0392f45f9bc
--- a/requirements_cpu_only_noavx2.txt
+++ b/requirements_cpu_only_noavx2.txt
@@ -1,19 +1,21 @@
-accelerate==0.27.*
+accelerate==0.33.*
 colorama
 datasets
 einops
+fastapi==0.112.4
 gradio==4.26.*
-hqq==0.1.5
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
 numpy==1.26.*
 optimum==1.17.*
 pandas
-peft==0.8.*
+peft==0.12.*
 Pillow>=9.5.0
 psutil
+pydantic==2.8.2
 pyyaml
 requests
 rich
@@ -21,7 +23,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.44.*
 tqdm
 wandb
@@ -32,7 +34,7 @@ sse-starlette==1.6.5
 tiktoken
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
diff --git a/requirements_noavx2.txt b/requirements_noavx2.txt
old mode 100755
new mode 100644
index 435e8cf08f600aa73d3c05542b1a1e4f6a14db29..fc253f5c2716f02b95fc24c54c07f3c74dcbf61d
--- a/requirements_noavx2.txt
+++ b/requirements_noavx2.txt
@@ -1,21 +1,24 @@
-accelerate==0.27.*
-aqlm[gpu,cpu]==1.1.3; platform_system == "Linux"
+accelerate==0.33.*
+aqlm[gpu,cpu]==1.1.6; platform_system == "Linux"
+auto-gptq==0.7.1
 bitsandbytes==0.43.*
 colorama
 datasets
 einops
+fastapi==0.112.4
 gradio==4.26.*
-hqq==0.1.5
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
 numpy==1.26.*
 optimum==1.17.*
 pandas
-peft==0.8.*
+peft==0.12.*
 Pillow>=9.5.0
 psutil
+pydantic==2.8.2
 pyyaml
 requests
 rich
@@ -23,7 +26,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.44.*
 tqdm
 wandb
@@ -34,39 +37,38 @@ sse-starlette==1.6.5
 tiktoken
 # llama-cpp-python (CPU only, no AVX2)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.65+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/cpu/llama_cpp_python-0.2.89+cpuavx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
 # llama-cpp-python (CUDA, no tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.65+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.65+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.65+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.65+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda-0.2.89+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 # llama-cpp-python (CUDA, tensor cores)
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.65+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.65+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.65+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.65+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121avx-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121avx-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121avx-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/llama-cpp-python-cuBLAS-wheels/releases/download/textgen-webui/llama_cpp_python_cuda_tensorcores-0.2.89+cu121avx-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
 # CUDA wheels
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/AutoGPTQ/releases/download/v0.6.0/auto_gptq-0.6.0+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/turboderp/exllamav2/releases/download/v0.0.19/exllamav2-0.0.19-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/oobabooga/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2.0cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu122torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
-https://github.com/jllllll/GPTQ-for-LLaMa-CUDA/releases/download/0.1.1/gptq_for_llama-0.1.1+cu121-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
-autoawq==0.2.3; platform_system == "Linux" or platform_system == "Windows"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8+cu121.torch2.2.2-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/exllamav2/releases/download/v0.1.8/exllamav2-0.1.8-py3-none-any.whl; platform_system == "Linux" and platform_machine != "x86_64"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu122torch2.2.2cxx11abiFALSE-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.1/flash_attn-2.6.1+cu123torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ/releases/download/0.2.6/autoawq-0.2.6-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-win_amd64.whl; platform_system == "Windows" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-win_amd64.whl; platform_system == "Windows" and python_version == "3.10"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp311-cp311-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.11"
+https://github.com/oobabooga/AutoAWQ_kernels/releases/download/0.0.7/autoawq_kernels-0.0.7-cp310-cp310-linux_x86_64.whl; platform_system == "Linux" and platform_machine == "x86_64" and python_version == "3.10"
diff --git a/requirements_nowheels.txt b/requirements_nowheels.txt
old mode 100755
new mode 100644
index 8da69bd2c0765430e38fa79415634cd433507ed0..ec18464d0fcfe45e095a22b5b3277f72fdb90190
--- a/requirements_nowheels.txt
+++ b/requirements_nowheels.txt
@@ -1,19 +1,21 @@
-accelerate==0.27.*
+accelerate==0.33.*
 colorama
 datasets
 einops
+fastapi==0.112.4
 gradio==4.26.*
-hqq==0.1.5
-jinja2==3.1.2
+hqq==0.1.7.post3
+jinja2==3.1.4
 lm_eval==0.3.0
 markdown
 numba==0.59.*
 numpy==1.26.*
 optimum==1.17.*
 pandas
-peft==0.8.*
+peft==0.12.*
 Pillow>=9.5.0
 psutil
+pydantic==2.8.2
 pyyaml
 requests
 rich
@@ -21,7 +23,7 @@ safetensors==0.4.*
 scipy
 sentencepiece
 tensorboard
-transformers==4.40.*
+transformers==4.44.*
 tqdm
 wandb
diff --git a/server.py b/server.py
old mode 100755
new mode 100644
index 38e033e1a2bbf3571d913c87df588604a08ba0cb..d6069d5e343562d373f17076547109adfb9fcb5c
--- a/server.py
+++ b/server.py
@@ -1,6 +1,5 @@
 import os
 import warnings
-import spaces
 from modules import shared
@@ -33,7 +32,7 @@
 import sys
 import time
 from functools import partial
 from pathlib import Path
-from threading import Lock
+from threading import Lock, Thread
 import yaml
@@ -53,7 +52,7 @@ from modules import (
 )
 from modules.extensions import apply_extensions
 from modules.LoRA import add_lora_to_model
-from modules.models import load_model
+from modules.models import load_model, unload_model_if_idle
 from modules.models_settings import (
     get_fallback_settings,
     get_model_metadata,
@@ -70,6 +69,7 @@ def signal_handler(sig, frame):
 signal.signal(signal.SIGINT, signal_handler)
+
 def create_interface():
     title = 'Text generation web UI'
@@ -90,7 +90,7 @@ def create_interface():
     # Force some events to be triggered on page load
     shared.persistent_interface_state.update({
         'loader': shared.args.loader or 'Transformers',
-        'mode': shared.settings['mode'],
+        'mode': shared.settings['mode'] if shared.settings['mode'] == 'instruct' else gr.update(),
         'character_menu': shared.args.character or shared.settings['character'],
         'instruction_template_str': shared.settings['instruction_template_str'],
         'prompt_menu-default': shared.settings['prompt-default'],
@@ -146,11 +146,21 @@ def create_interface():
     ui_model_menu.create_event_handlers()
     # Interface launch events
-    shared.gradio['interface'].load(lambda: None, None, None, js=f"() => {{if ({str(shared.settings['dark_theme']).lower()}) {{ document.getElementsByTagName('body')[0].classList.add('dark'); }} }}")
-    shared.gradio['interface'].load(lambda: None, None, None, js=f"() => {{{js}}}")
-    shared.gradio['interface'].load(lambda x: None, gradio('show_controls'), None, js=f'(x) => {{{ui.show_controls_js}; toggle_controls(x)}}')
+    shared.gradio['interface'].load(
+        None,
+        gradio('show_controls'),
+        None,
+        js=f"""(x) => {{
+            if ({str(shared.settings['dark_theme']).lower()}) {{
+                document.getElementsByTagName('body')[0].classList.add('dark');
+            }}
+            {js}
+            {ui.show_controls_js}
+            toggle_controls(x);
+        }}"""
+    )
+
     shared.gradio['interface'].load(partial(ui.apply_interface_values, {}, use_persistent=True), None, gradio(ui.list_interface_input_elements()), show_progress=False)
-    shared.gradio['interface'].load(chat.redraw_html, gradio(ui_chat.reload_arr), gradio('display'))
     extensions_module.create_extensions_tabs()  # Extensions tabs
     extensions_module.create_extensions_block()  # Extensions block
@@ -169,9 +179,11 @@ def create_interface():
         ssl_verify=False if (shared.args.ssl_keyfile or shared.args.ssl_certfile) else True,
         ssl_keyfile=shared.args.ssl_keyfile,
         ssl_certfile=shared.args.ssl_certfile,
+        root_path=shared.args.subpath,
         allowed_paths=["cache", "css", "extensions", "js"]
     )
+
 if __name__ == "__main__":
     logger.info("Starting Text generation web UI")
@@ -244,6 +256,11 @@ if __name__ == "__main__":
     shared.generation_lock = Lock()
+    if shared.args.idle_timeout > 0:
+        timer_thread = Thread(target=unload_model_if_idle)
+        timer_thread.daemon = True
+        timer_thread.start()
+
     if shared.args.nowebui:
         # Start the API in standalone mode
         shared.args.extensions = [x for x in shared.args.extensions if x != 'gallery']
diff --git a/settings-template.yaml b/settings-template.yaml
old mode 100755
new mode 100644
index f09c845e15fd18a4aacd41331515b02b8ac02503..59c76c350b8aaba3d3b9b038ce2d4bb934e348d9
--- a/settings-template.yaml
+++ b/settings-template.yaml
@@ -1,7 +1,7 @@
 dark_theme: true
 show_controls: true
 start_with: ''
-mode: chat
+mode: chat-instruct
 chat_style: cai-chat
 prompt-default: QA
 prompt-notebook: QA
@@ -12,8 +12,6 @@ max_new_tokens_max: 4096
 negative_prompt: ''
 seed: -1
 truncation_length: 2048
-truncation_length_min: 0
-truncation_length_max: 200000
 max_tokens_second: 0
 max_updates_second: 0
 prompt_lookup_num_tokens: 0
diff --git a/setup.cfg b/setup.cfg
old mode 100755
new mode 100644
diff --git a/start_linux.sh b/start_linux.sh
index 5620c831387a53e1d69a783b78b50b91f9353049..792daca840a0a430d17ace18b92c033a27376054 100755
--- a/start_linux.sh
+++ b/start_linux.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 cd "$(dirname "${BASH_SOURCE[0]}")"
diff --git a/training/datasets/put-trainer-datasets-here.txt b/training/datasets/put-trainer-datasets-here.txt
old mode 100755
new mode 100644
diff --git a/training/formats/ChatML-format.json b/training/formats/ChatML-format.json
new file mode 100644
index 0000000000000000000000000000000000000000..a9f8a09afa1c6e8b6520b82722cb4c3e84ac79e8
--- /dev/null
+++ b/training/formats/ChatML-format.json
@@ -0,0 +1,4 @@
+{
+    "instruction,output": "<|im_start|>system\n<|im_end|>\n<|im_start|>user\n%instruction%<|im_end|>\n<|im_start|>assistant\n%output%<|im_end|>",
+    "instruction,input,output": "<|im_start|>system\n<|im_end|>\n<|im_start|>user\n%instruction%: %input%<|im_end|>\n<|im_start|>assistant\n%output%<|im_end|>"
+}
diff --git a/training/formats/alpaca-chatbot-format.json b/training/formats/alpaca-chatbot-format.json
old mode 100755
new mode 100644
diff --git a/training/formats/alpaca-format.json b/training/formats/alpaca-format.json
old mode 100755
new mode 100644
diff --git a/training/formats/llama2-chat-format.json b/training/formats/llama2-chat-format.json
old mode 100755
new mode 100644
diff --git a/training/formats/vicuna-format.json b/training/formats/vicuna-format.json
old mode 100755
new mode 100644
diff --git a/update_wizard_linux.sh b/update_wizard_linux.sh
index c5add61ecc2e4c58f27a4143b372ee74857283f4..3ada9a1e47fb6ca685b722ea5bdcbec1a8c01a8e 100755
--- a/update_wizard_linux.sh
+++ b/update_wizard_linux.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 cd "$(dirname "${BASH_SOURCE[0]}")"