Spaces:

John6666
/

votepurchase-multiple-model

Running on Zero

App Files Files Community

votepurchase-multiple-model / llmdolphin.py

John6666

Upload 8 files

6c79e9c verified 1 day ago

raw

history blame contribute delete

17.5 kB

	import spaces
	import gradio as gr
	from pathlib import Path
	import re
	import torch
	import gc
	from typing import Any
	from huggingface_hub import hf_hub_download, HfApi
	from llama_cpp import Llama
	from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
	from llama_cpp_agent.providers import LlamaCppPythonProvider
	from llama_cpp_agent.chat_history import BasicChatHistory
	from llama_cpp_agent.chat_history.messages import Roles
	from ja_to_danbooru.ja_to_danbooru import jatags_to_danbooru_tags
	import wrapt_timeout_decorator
	from llama_cpp_agent.messages_formatter import MessagesFormatter
	from formatter import mistral_v1_formatter, mistral_v2_formatter, mistral_v3_tekken_formatter
	from llmenv import llm_models, llm_models_dir, llm_formats, llm_languages, dolphin_system_prompt
	import subprocess
	# Import-time cleanup: delete everything under /data-nvme/zerogpu-offload/.
	# shell=True is needed so the shell expands the "*" glob; env={} runs the
	# command with an empty environment. The return code is not checked, so a
	# failure here does not stop the module from loading.
	subprocess.run("rm -rf /data-nvme/zerogpu-offload/*", env={}, shell=True)


	# (name, value) pairs describing the selectable models; rebuilt by
	# update_llm_model_tupled_list() and passed to gr.update(choices=...).
	llm_models_tupled_list = []
	# Fallback model: the first key of llm_models.
	default_llm_model_filename = list(llm_models.keys())[0]
	# "cuda" when torch reports a CUDA device as available, otherwise "cpu".
	device = "cuda" if torch.cuda.is_available() else "cpu"


	def to_list(s: str):
	    """Split a comma-separated string into stripped, non-empty items.

	    Empty items (from an empty input, consecutive commas, trailing commas or
	    whitespace-only segments) are dropped, so the result never contains "".

	    Args:
	        s: Comma-separated text.

	    Returns:
	        List of stripped items in their original order.
	    """
	    # Filter on each item rather than on the whole string, so blanks produced
	    # by "a,,b" or "a, " do not leak into the result.
	    return [item for item in (part.strip() for part in s.split(",")) if item]


	def list_uniq(l: list):
	    """Return the items of ``l`` without duplicates, keeping first-occurrence order.

	    Items must be hashable.

	    Args:
	        l: Input list.

	    Returns:
	        A new list with duplicates removed.
	    """
	    # dict preserves insertion order, so this keeps the first occurrence of
	    # each item in a single O(n) pass (no repeated l.index() scans).
	    return list(dict.fromkeys(l))


	# Fallback values returned by get_state() when a key is absent from the
	# state dict it is given.
	DEFAULT_STATE = {
	# Key into dolphin_system_prompt selecting the system prompt template.
	"dolphin_sysprompt_mode": "Default",
	# Language substituted for <LANGUAGE> in the system prompt; defaults to the
	# first entry of llm_languages.
	"dolphin_output_language": llm_languages[0],
	}


	def get_state(state: dict, key: str):
	    """Look up ``key`` in ``state``, falling back to ``DEFAULT_STATE``.

	    Args:
	        state: State dictionary to read from.
	        key: Name of the state entry.

	    Returns:
	        ``state[key]`` when present; otherwise ``DEFAULT_STATE[key]`` when a
	        default exists; otherwise ``None``. A message is printed in both
	        fallback cases.
	    """
	    if key in state:
	        return state[key]
	    if key in DEFAULT_STATE:
	        print(f"State '{key}' not found. Use default value.")
	        return DEFAULT_STATE[key]
	    print(f"State '{key}' not found.")
	    return None


	def set_state(state: dict, key: str, value: Any):
	    """Store ``value`` under ``key`` in ``state``, mutating the dict in place.

	    Args:
	        state: State dictionary to modify.
	        key: Name of the state entry.
	        value: Value to store; replaces any existing entry.
	    """
	    state[key] = value


	@wrapt_timeout_decorator.timeout(dec_timeout=3.5)
	def to_list_ja(s: str):
	    """Split text on commas and the Japanese marks ``、`` / ``。`` into items.

	    Both Japanese marks are first rewritten to ASCII commas; the text is then
	    split on commas, each item is stripped, and empty items are dropped.
	    The call is wrapped by ``wrapt_timeout_decorator.timeout`` with
	    ``dec_timeout=3.5``.

	    Args:
	        s: Text whose items are separated by ``,``, ``、`` or ``。``.

	    Returns:
	        List of stripped, non-empty items in their original order.
	    """
	    normalized = re.sub(r'[、。]', ',', s)
	    # Filter on each item rather than on the whole string; a trailing "。"
	    # would otherwise leave an empty item at the end of the list.
	    return [item for item in (part.strip() for part in normalized.split(",")) if item]


	def is_japanese(s: str):
	    """Report whether ``s`` contains at least one kanji or kana character.

	    A character counts when its Unicode name contains "CJK UNIFIED",
	    "HIRAGANA" or "KATAKANA". Characters without a Unicode name are ignored.

	    Args:
	        s: Text to inspect.

	    Returns:
	        True if any character matches, False otherwise (including for "").
	    """
	    import unicodedata

	    markers = ("CJK UNIFIED", "HIRAGANA", "KATAKANA")
	    return any(
	        marker in unicodedata.name(char, "")
	        for char in s
	        for marker in markers
	    )


	def update_llm_model_tupled_list():
	    """Rebuild the global list of selectable models and return it.

	    The list holds one ``(name, value)`` pair per key of ``llm_models``,
	    followed by one pair per ``*.gguf`` file found in ``llm_models_dir``;
	    in both cases name and value are the same string. Duplicates are removed
	    with ``list_uniq``.

	    Returns:
	        The refreshed ``llm_models_tupled_list``.
	    """
	    global llm_models_tupled_list
	    entries = [(model_key, model_key) for model_key in llm_models]
	    entries.extend(
	        (gguf_path.name, gguf_path.name)
	        for gguf_path in Path(llm_models_dir).glob('*.gguf')
	    )
	    llm_models_tupled_list = list_uniq(entries)
	    return llm_models_tupled_list


	def download_llm_models():
	    """Download every model listed in ``llm_models`` and rebuild the choice list.

	    Each key of ``llm_models`` is used as the file name and the first element
	    of its value as the repo ID. Files are downloaded into ``llm_models_dir``.
	    Downloading is best-effort: a failing model is reported and skipped, and
	    only successfully downloaded models are added to the global
	    ``llm_models_tupled_list`` as ``(filename, filename)`` pairs.
	    """
	    global llm_models_tupled_list
	    llm_models_tupled_list = []
	    for filename, info in llm_models.items():
	        try:
	            hf_hub_download(repo_id=info[0], filename=filename, local_dir=llm_models_dir)
	        except Exception as e:
	            # Keep going with the remaining models, but do not hide the failure.
	            print(f"Failed to download '{filename}': {e}")
	            continue
	        llm_models_tupled_list.append((filename, filename))


	def download_llm_model(filename: str):
	    """Download one registered model file and refresh the choice list.

	    Args:
	        filename: Key of ``llm_models`` naming the file to download.

	    Returns:
	        ``filename`` on success. ``default_llm_model_filename`` when the name
	        is not registered in ``llm_models`` or the download raises (the
	        exception is printed).
	    """
	    if filename not in llm_models.keys():
	        return default_llm_model_filename
	    try:
	        source_repo = llm_models[filename][0]
	        hf_hub_download(repo_id=source_repo, filename=filename, local_dir=llm_models_dir)
	    except Exception as e:
	        print(e)
	        return default_llm_model_filename
	    update_llm_model_tupled_list()
	    return filename


	def get_dolphin_model_info(filename: str):
	    """Build a Markdown line linking to the repo a model file comes from.

	    Args:
	        filename: Key of ``llm_models``.

	    Returns:
	        ``'Repo: [<repo>](https://huggingface.co/<repo>)'`` using the first
	        element of the model's entry, or the string ``"None"`` when the model
	        is unknown or its entry is empty.
	    """
	    entry = llm_models.get(filename, None)
	    if not entry:
	        return "None"
	    return f'Repo: [{entry[0]}](https://huggingface.co/{entry[0]})'


	def select_dolphin_model(filename: str, state: dict, progress=gr.Progress(track_tqdm=True)):
	    """Download the chosen model and return the UI updates that follow from it.

	    Any format override stored in ``state`` is cleared first.

	    Args:
	        filename: Requested model file name.
	        state: State dictionary; ``"override_llm_format"`` is reset to None.
	        progress: Gradio progress tracker.

	    Returns:
	        A 4-tuple: an update carrying the resolved model value and the current
	        choices, an update carrying that model's format name, an update
	        carrying its info Markdown, and ``state``.
	    """
	    set_state(state, "override_llm_format", None)
	    progress(0, desc="Loading model...")
	    value = download_llm_model(filename)
	    progress(1, desc="Model loaded.")
	    # Describe the model that was actually selected: download_llm_model()
	    # falls back to the default model on failure, and the value and format
	    # returned below are derived from that resolved name as well.
	    md = get_dolphin_model_info(value)
	    return (
	        gr.update(value=value, choices=get_dolphin_models()),
	        gr.update(value=get_dolphin_model_format(value)),
	        gr.update(value=md),
	        state,
	    )


	def select_dolphin_format(format_name: str, state: dict):
	    """Record a manually chosen chat format as an override in ``state``.

	    Args:
	        format_name: Key of ``llm_formats``.
	        state: State dictionary; receives ``"override_llm_format"``.

	    Returns:
	        A 2-tuple of an update carrying ``format_name`` and ``state``.
	    """
	    chosen_format = llm_formats[format_name]
	    set_state(state, "override_llm_format", chosen_format)
	    format_update = gr.update(value=format_name)
	    return format_update, state


	# Import-time side effect: fetch the default model file so it is present in
	# llm_models_dir; the return value is ignored.
	download_llm_model(default_llm_model_filename)


	def get_dolphin_models():
	    """Refresh and return the list of ``(name, value)`` model choices."""
	    refreshed_choices = update_llm_model_tupled_list()
	    return refreshed_choices


	def get_llm_formats():
	    """Return the names of all known chat formats (the keys of ``llm_formats``)."""
	    return [format_name for format_name in llm_formats.keys()]


	def get_key_from_value(d, val):
	    """Reverse lookup: return the first key of ``d`` whose value equals ``val``.

	    Args:
	        d: Mapping to search, in iteration order.
	        val: Value to look for (compared with ``==``).

	    Returns:
	        The first matching key, or None when no value matches.
	    """
	    return next((key for key, candidate in d.items() if candidate == val), None)


	def get_dolphin_model_format(filename: str):
	    """Return the format name registered for a model file.

	    Args:
	        filename: Model file name; unknown names are replaced by
	            ``default_llm_model_filename``.

	    Returns:
	        The key of ``llm_formats`` whose value equals the second element of
	        the model's ``llm_models`` entry, or None when no format matches.
	    """
	    is_registered = filename in llm_models.keys()
	    model_key = filename if is_registered else default_llm_model_filename
	    registered_format = llm_models[model_key][1]
	    return get_key_from_value(llm_formats, registered_format)


	def add_dolphin_models(query: str, format_name: str):
	    """Register GGUF model file(s) from a Hugging Face repo ID or URL.

	    ``query`` may be a bare ``owner/repo`` ID, a repo URL, or a URL pointing at
	    a single ``.gguf`` file (e.g. ``.../owner/repo/blob/main/model.gguf``).
	    For a repo, every file ending in ``.gguf`` is registered; for a file URL,
	    only that file. New entries are merged into the global ``llm_models`` as
	    ``filename -> [repo, format]``.

	    Args:
	        query: Repo ID or huggingface.co URL.
	        format_name: Key of ``llm_formats`` to associate with the new models.

	    Returns:
	        An update carrying the refreshed choices with the last choice
	        selected, or an empty ``gr.update()`` when the query does not parse,
	        the repo/file does not exist, no ``.gguf`` file is found, or the Hub
	        lookup raises (the exception is printed).

	    Raises:
	        KeyError: If ``format_name`` is not a key of ``llm_formats``.
	    """
	    global llm_models
	    api = HfApi()
	    add_models = {}
	    chat_format = llm_formats[format_name]
	    try:
	        # Group 1: "owner/repo". Group 2 (optional): the .gguf file name found
	        # after intermediate path segments such as "/blob/main/".
	        matched = re.match(
	            r'^(?:https?://huggingface\.co/)?(.+?/.+?)(?:/.*/(.+?\.gguf).*?)?$',
	            query.strip(),
	        )
	        if matched is None:
	            return gr.update()
	        repo, filename = matched.group(1), matched.group(2)
	        if not api.repo_exists(repo_id=repo):
	            return gr.update()
	        if filename:
	            if not api.file_exists(repo_id=repo, filename=filename):
	                return gr.update()
	            add_models[filename] = [repo, chat_format]
	        else:
	            for file in api.list_repo_files(repo_id=repo):
	                # Key each entry by its own file name; a shared key would
	                # collapse all files of the repo into a single bogus entry.
	                if str(file).endswith(".gguf"):
	                    add_models[file] = [repo, chat_format]
	    except Exception as e:
	        print(e)
	        return gr.update()
	    if not add_models:
	        # Nothing to register: leave the UI untouched.
	        return gr.update()
	    llm_models = llm_models | add_models
	    update_llm_model_tupled_list()
	    choices = get_dolphin_models()
	    return gr.update(choices=choices, value=choices[-1][1])


	def get_dolphin_sysprompt(state: dict={}):
	    """Build the system prompt for the mode and language stored in ``state``.

	    The template is ``dolphin_system_prompt[mode]``, or the first template of
	    ``dolphin_system_prompt`` when the mode is unknown. ``<LANGUAGE>`` in the
	    template is replaced by the stored output language, or by
	    ``llm_languages[0]`` when that is empty. ``state`` is only read.

	    Args:
	        state: State dictionary (missing keys fall back via ``get_state``).

	    Returns:
	        The system prompt string.
	    """
	    mode = get_state(state, "dolphin_sysprompt_mode")
	    language = get_state(state, "dolphin_output_language")
	    first_mode = list(dolphin_system_prompt.keys())[0]
	    template = dolphin_system_prompt.get(mode, dolphin_system_prompt[first_mode])
	    if not language:
	        language = llm_languages[0]
	    return re.sub('<LANGUAGE>', language, template)


	def get_dolphin_sysprompt_mode():
	    """Return the available system prompt modes (keys of ``dolphin_system_prompt``)."""
	    return [mode for mode in dolphin_system_prompt.keys()]


	def select_dolphin_sysprompt(key: str, state: dict):
	    """Store the selected system prompt mode and return the resulting prompt.

	    Args:
	        key: Requested mode; anything that is not a key of
	            ``dolphin_system_prompt`` is replaced by ``"Default"``.
	        state: State dictionary; receives ``"dolphin_sysprompt_mode"``.

	    Returns:
	        A 2-tuple of an update carrying the rebuilt system prompt and ``state``.
	    """
	    # The previous mode is irrelevant here, so it is not read from the state
	    # (reading it could only emit a misleading "not found" log line).
	    mode = key if key in dolphin_system_prompt else "Default"
	    set_state(state, "dolphin_sysprompt_mode", mode)
	    return gr.update(value=get_dolphin_sysprompt(state)), state


	def get_dolphin_languages():
	    """Return ``llm_languages``, the collection of selectable output languages.

	    The object itself is returned, not a copy.
	    """
	    return llm_languages


	def select_dolphin_language(lang: str, state: dict):
	    """Store the selected output language and return the resulting prompt.

	    Args:
	        lang: Language to store under ``"dolphin_output_language"``.
	        state: State dictionary to modify.

	    Returns:
	        A 2-tuple of an update carrying the rebuilt system prompt and ``state``.
	    """
	    set_state(state, "dolphin_output_language", lang)
	    refreshed_prompt = get_dolphin_sysprompt(state)
	    return gr.update(value=refreshed_prompt), state


	@wrapt_timeout_decorator.timeout(dec_timeout=5.0)
	def get_raw_prompt(msg: str):
	    """Extract the text enclosed in ``/GENBEGIN/`` ... ``/GENEND/`` markers.

	    All marked segments (which may span lines) are joined with ", ". The
	    characters ``* / : _ " #`` and newlines are then replaced by spaces and
	    the result is lower-cased. The call is wrapped by
	    ``wrapt_timeout_decorator.timeout`` with ``dec_timeout=5.0``.

	    Args:
	        msg: Text to search.

	    Returns:
	        The cleaned prompt text, or "" when no marked segment is found.
	    """
	    segments = re.findall(r'/GENBEGIN/(.+?)/GENEND/', msg, re.DOTALL)
	    if not segments:
	        return ""
	    joined = ", ".join(segments)
	    return re.sub(r'[*/:_"#\n]', ' ', joined).lower()


	@torch.inference_mode()
	@spaces.GPU(duration=59)
	def dolphin_respond(
	    message: str,
	    history: list[tuple[str, str]],
	    model: str = default_llm_model_filename,
	    system_message: str = get_dolphin_sysprompt(),
	    max_tokens: int = 1024,
	    temperature: float = 0.7,
	    top_p: float = 0.95,
	    top_k: int = 40,
	    repeat_penalty: float = 1.1,
	    state: dict = {},
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Stream a chat response from a local GGUF model.

	    The model file is loaded from ``llm_models_dir``; the chat format is the
	    ``"override_llm_format"`` entry of ``state`` when set, otherwise the one
	    registered for ``model`` in ``llm_models``. Each entry of ``history``
	    contributes a user message (element 0) and an assistant message
	    (element 1) to the chat history.

	    Args:
	        message: New user message.
	        history: Previous turns.
	        model: Model file name inside ``llm_models_dir``.
	        system_message: System prompt for the agent.
	        max_tokens: Sampling setting ``max_tokens``.
	        temperature: Sampling setting ``temperature``.
	        top_p: Sampling setting ``top_p``.
	        top_k: Sampling setting ``top_k``.
	        repeat_penalty: Sampling setting ``repeat_penalty``.
	        state: State dictionary (only read).
	        progress: Gradio progress tracker.

	    Yields:
	        ``[(text_so_far, None)]`` after every streamed chunk.

	    Raises:
	        gr.Error: If the model file is missing or anything else fails; the
	            underlying exception is printed first.
	    """
	    try:
	        model_path = Path(f"{llm_models_dir}/{model}")
	        if not model_path.exists():
	            raise gr.Error(f"Model file not found: {str(model_path)}")
	        progress(0, desc="Processing...")

	        # An override stored in the state wins over the model's own format.
	        chat_template = get_state(state, "override_llm_format") or llm_models[model][1]
	        # A MessagesFormatter instance is passed as a custom formatter; any
	        # other value is passed as a predefined formatter type.
	        uses_custom_formatter = isinstance(chat_template, MessagesFormatter)

	        llm = Llama(
	            model_path=str(model_path),
	            flash_attn=True,
	            n_gpu_layers=81,
	            n_batch=1024,
	            n_ctx=8192,
	        )
	        provider = LlamaCppPythonProvider(llm)
	        agent = LlamaCppAgent(
	            provider,
	            system_prompt=f"{system_message}",
	            predefined_messages_formatter_type=None if uses_custom_formatter else chat_template,
	            custom_messages_formatter=chat_template if uses_custom_formatter else None,
	            debug_output=False,
	        )

	        settings = provider.get_provider_default_settings()
	        sampling_overrides = (
	            ("temperature", temperature),
	            ("top_k", top_k),
	            ("top_p", top_p),
	            ("max_tokens", max_tokens),
	            ("repeat_penalty", repeat_penalty),
	            ("stream", True),
	        )
	        for setting_name, setting_value in sampling_overrides:
	            setattr(settings, setting_name, setting_value)

	        chat_history = BasicChatHistory()
	        for turn in history:
	            user_entry = {'role': Roles.user, 'content': turn[0]}
	            assistant_entry = {'role': Roles.assistant, 'content': turn[1]}
	            chat_history.add_message(user_entry)
	            chat_history.add_message(assistant_entry)

	        stream = agent.get_chat_response(
	            message,
	            llm_sampling_settings=settings,
	            chat_history=chat_history,
	            returns_streaming_generator=True,
	            print_output=False,
	        )

	        progress(0.5, desc="Processing...")

	        outputs = ""
	        for chunk in stream:
	            outputs += chunk
	            yield [(outputs, None)]
	    except Exception as e:
	        print(e)
	        raise gr.Error(f"Error: {e}")
	    finally:
	        # Runs on success, on error, and when the generator is closed.
	        torch.cuda.empty_cache()
	        gc.collect()


	def dolphin_parse(
	    history: list[tuple[str, str]],
	    state: dict,
	):
	    """Turn the latest history entry into a comma-separated tag prompt.

	    The text between ``/GENBEGIN/`` and ``/GENEND/`` in element 0 of the last
	    history entry is split into tags. In "Japanese to Danbooru Dictionary"
	    mode, Japanese text is converted with ``jatags_to_danbooru_tags``. The
	    tags "nsfw" and "explicit" are always appended, and duplicates are removed.

	    Args:
	        history: Chat history; only the last entry is used.
	        state: State dictionary providing ``"dolphin_sysprompt_mode"``.

	    Returns:
	        A 3-tuple ``(prompt, update, update)``. In "Chat with LLM" mode, with
	        empty history, or on any exception (which is printed), the prompt is
	        "" and both updates are empty; otherwise both updates set
	        ``interactive=True``.
	    """
	    try:
	        mode = get_state(state, "dolphin_sysprompt_mode")
	        if mode == "Chat with LLM" or not history:
	            return "", gr.update(), gr.update()
	        raw_prompt = get_raw_prompt(history[-1][0])
	        if mode == "Japanese to Danbooru Dictionary" and is_japanese(raw_prompt):
	            tags = jatags_to_danbooru_tags(to_list_ja(raw_prompt))
	        else:
	            tags = to_list(raw_prompt)
	        prompts = list_uniq(tags + ["nsfw", "explicit"])
	        return ", ".join(prompts), gr.update(interactive=True), gr.update(interactive=True)
	    except Exception as e:
	        print(e)
	        return "", gr.update(), gr.update()


	@torch.inference_mode()
	@spaces.GPU(duration=59)
	def dolphin_respond_auto(
	    message: str,
	    history: list[tuple[str, str]],
	    model: str = default_llm_model_filename,
	    system_message: str = get_dolphin_sysprompt(),
	    max_tokens: int = 1024,
	    temperature: float = 0.7,
	    top_p: float = 0.95,
	    top_k: int = 40,
	    repeat_penalty: float = 1.1,
	    state: dict = {},
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Stream a chat response from a local GGUF model without raising on failure.

	    The model file is loaded from ``llm_models_dir``; the chat format is the
	    ``"override_llm_format"`` entry of ``state`` when set, otherwise the one
	    registered for ``model`` in ``llm_models``. Each entry of ``history``
	    contributes a user message (element 0) and an assistant message
	    (element 1) to the chat history.

	    Args:
	        message: New user message.
	        history: Previous turns.
	        model: Model file name inside ``llm_models_dir``.
	        system_message: System prompt for the agent.
	        max_tokens: Sampling setting ``max_tokens``.
	        temperature: Sampling setting ``temperature``.
	        top_p: Sampling setting ``top_p``.
	        top_k: Sampling setting ``top_k``.
	        repeat_penalty: Sampling setting ``repeat_penalty``.
	        state: State dictionary (only read).
	        progress: Gradio progress tracker.

	    Yields:
	        ``([(text_so_far, None)], update, update)`` after every streamed
	        chunk, where both updates are empty. On any exception the error is
	        printed and ``([("", None)], update, update)`` is yielded instead.
	    """
	    try:
	        model_path = Path(f"{llm_models_dir}/{model}")
	        progress(0, desc="Processing...")

	        # An override stored in the state wins over the model's own format.
	        chat_template = get_state(state, "override_llm_format") or llm_models[model][1]
	        # A MessagesFormatter instance is passed as a custom formatter; any
	        # other value is passed as a predefined formatter type.
	        uses_custom_formatter = isinstance(chat_template, MessagesFormatter)

	        llm = Llama(
	            model_path=str(model_path),
	            flash_attn=True,
	            n_gpu_layers=81,
	            n_batch=1024,
	            n_ctx=8192,
	        )
	        provider = LlamaCppPythonProvider(llm)
	        agent = LlamaCppAgent(
	            provider,
	            system_prompt=f"{system_message}",
	            predefined_messages_formatter_type=None if uses_custom_formatter else chat_template,
	            custom_messages_formatter=chat_template if uses_custom_formatter else None,
	            debug_output=False,
	        )

	        settings = provider.get_provider_default_settings()
	        sampling_overrides = (
	            ("temperature", temperature),
	            ("top_k", top_k),
	            ("top_p", top_p),
	            ("max_tokens", max_tokens),
	            ("repeat_penalty", repeat_penalty),
	            ("stream", True),
	        )
	        for setting_name, setting_value in sampling_overrides:
	            setattr(settings, setting_name, setting_value)

	        chat_history = BasicChatHistory()
	        for turn in history:
	            user_entry = {'role': Roles.user, 'content': turn[0]}
	            assistant_entry = {'role': Roles.assistant, 'content': turn[1]}
	            chat_history.add_message(user_entry)
	            chat_history.add_message(assistant_entry)

	        progress(0, desc="Translating...")
	        stream = agent.get_chat_response(
	            message,
	            llm_sampling_settings=settings,
	            chat_history=chat_history,
	            returns_streaming_generator=True,
	            print_output=False,
	        )

	        progress(0.5, desc="Processing...")

	        outputs = ""
	        for chunk in stream:
	            outputs += chunk
	            yield [(outputs, None)], gr.update(), gr.update()
	    except Exception as e:
	        print(e)
	        yield [("", None)], gr.update(), gr.update()
	    finally:
	        # Runs on success, on error, and when the generator is closed.
	        torch.cuda.empty_cache()
	        gc.collect()


	def dolphin_parse_simple(
	    message: str,
	    history: list[tuple[str, str]],
	    state: dict,
	):
	    """Turn the latest history entry into a comma-separated tag prompt string.

	    The text between ``/GENBEGIN/`` and ``/GENEND/`` in element 0 of the last
	    history entry is split into tags. In "Japanese to Danbooru Dictionary"
	    mode, Japanese text is converted with ``jatags_to_danbooru_tags``. The
	    tags "nsfw", "explicit" and "rating_explicit" are always appended, and
	    duplicates are removed.

	    Args:
	        message: Returned unchanged when parsing is skipped.
	        history: Chat history; only the last entry is used.
	        state: State dictionary providing ``"dolphin_sysprompt_mode"``.

	    Returns:
	        The tag prompt; ``message`` in "Chat with LLM" mode or with empty
	        history; "" on any exception (which is printed).
	    """
	    try:
	        mode = get_state(state, "dolphin_sysprompt_mode")
	        if mode == "Chat with LLM" or not history:
	            return message
	        raw_prompt = get_raw_prompt(history[-1][0])
	        if mode == "Japanese to Danbooru Dictionary" and is_japanese(raw_prompt):
	            tags = jatags_to_danbooru_tags(to_list_ja(raw_prompt))
	        else:
	            tags = to_list(raw_prompt)
	        prompts = list_uniq(tags + ["nsfw", "explicit", "rating_explicit"])
	        return ", ".join(prompts)
	    except Exception as e:
	        print(e)
	        return ""


	# https://huggingface.co/spaces/CaioXapelaum/GGUF-Playground
	# NOTE(review): the URL above presumably credits the source of the
	# playground code that follows — confirm.
	import cv2
	# Limit OpenCV to a single thread. The reason is not stated in this file;
	# TODO confirm why it is needed.
	cv2.setNumThreads(1)


	@torch.inference_mode()
	@spaces.GPU(duration=59)
	def respond_playground(
	    message: str,
	    history: list[tuple[str, str]],
	    model: str = default_llm_model_filename,
	    system_message: str = get_dolphin_sysprompt(),
	    max_tokens: int = 1024,
	    temperature: float = 0.7,
	    top_p: float = 0.95,
	    top_k: int = 40,
	    repeat_penalty: float = 1.1,
	    state: dict = {},
	    progress=gr.Progress(track_tqdm=True),
	):
	    """Stream a chat response from a local GGUF model as plain text.

	    The model file is loaded from ``llm_models_dir``; the chat format is the
	    ``"override_llm_format"`` entry of ``state`` when set, otherwise the one
	    registered for ``model`` in ``llm_models``. Each entry of ``history``
	    contributes a user message (element 0) and an assistant message
	    (element 1) to the chat history.

	    Args:
	        message: New user message.
	        history: Previous turns.
	        model: Model file name inside ``llm_models_dir``.
	        system_message: System prompt for the agent.
	        max_tokens: Sampling setting ``max_tokens``.
	        temperature: Sampling setting ``temperature``.
	        top_p: Sampling setting ``top_p``.
	        top_k: Sampling setting ``top_k``.
	        repeat_penalty: Sampling setting ``repeat_penalty``.
	        state: State dictionary (only read).
	        progress: Gradio progress tracker (not called in this function).

	    Yields:
	        The response text accumulated so far, after every streamed chunk.

	    Raises:
	        gr.Error: If the model file is missing or anything else fails; the
	            underlying exception is printed first.
	    """
	    try:
	        model_path = Path(f"{llm_models_dir}/{model}")
	        if not model_path.exists():
	            raise gr.Error(f"Model file not found: {str(model_path)}")

	        # An override stored in the state wins over the model's own format.
	        chat_template = get_state(state, "override_llm_format") or llm_models[model][1]
	        # A MessagesFormatter instance is passed as a custom formatter; any
	        # other value is passed as a predefined formatter type.
	        uses_custom_formatter = isinstance(chat_template, MessagesFormatter)

	        llm = Llama(
	            model_path=str(model_path),
	            flash_attn=True,
	            n_gpu_layers=81,
	            n_batch=1024,
	            n_ctx=8192,
	        )
	        provider = LlamaCppPythonProvider(llm)
	        agent = LlamaCppAgent(
	            provider,
	            system_prompt=f"{system_message}",
	            predefined_messages_formatter_type=None if uses_custom_formatter else chat_template,
	            custom_messages_formatter=chat_template if uses_custom_formatter else None,
	            debug_output=False,
	        )

	        settings = provider.get_provider_default_settings()
	        sampling_overrides = (
	            ("temperature", temperature),
	            ("top_k", top_k),
	            ("top_p", top_p),
	            ("max_tokens", max_tokens),
	            ("repeat_penalty", repeat_penalty),
	            ("stream", True),
	        )
	        for setting_name, setting_value in sampling_overrides:
	            setattr(settings, setting_name, setting_value)

	        # Replay earlier turns into the agent's chat history.
	        chat_history = BasicChatHistory()
	        for turn in history:
	            user_entry = {'role': Roles.user, 'content': turn[0]}
	            assistant_entry = {'role': Roles.assistant, 'content': turn[1]}
	            chat_history.add_message(user_entry)
	            chat_history.add_message(assistant_entry)

	        stream = agent.get_chat_response(
	            message,
	            llm_sampling_settings=settings,
	            chat_history=chat_history,
	            returns_streaming_generator=True,
	            print_output=False,
	        )

	        outputs = ""
	        for chunk in stream:
	            outputs += chunk
	            yield outputs
	    except Exception as e:
	        print(e)
	        raise gr.Error(f"Error: {e}")
	    finally:
	        # Runs on success, on error, and when the generator is closed.
	        torch.cuda.empty_cache()
	        gc.collect()