Spaces:

KwabsHug
/

GameConfigIdea

Running on Zero

File size: 12,476 Bytes

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gc
import sys
from diffusers import FluxPipeline
import time
from sentence_transformers import SentenceTransformer
import psutil
import json
import spaces
from threading import Thread
#-----------------
from relatively_constant_variables import knowledge_base

# Initialize the zero tensor on CUDA
# generate_response() later uses zero.device to decide where tokenizer inputs are moved.
zero = torch.Tensor([0]).cuda()
print(zero.device)  # This will print 'cpu' outside the @spaces.GPU decorated function

# Hugging Face model ids that can be handed to load_model(); the entry at
# current_model_index is the one loaded at import time.
modelnames = ["stvlynn/Gemma-2-2b-Chinese-it", "unsloth/Llama-3.2-1B-Instruct", "unsloth/Llama-3.2-3B-Instruct", "nbeerbower/mistral-nemo-wissenschaft-12B", "princeton-nlp/gemma-2-9b-it-SimPO", "cognitivecomputations/dolphin-2.9.3-mistral-7B-32k", "01-ai/Yi-Coder-9B-Chat", "ArliAI/Llama-3.1-8B-ArliAI-RPMax-v1.1", "ArliAI/Phi-3.5-mini-3.8B-ArliAI-RPMax-v1.1", 
              "Qwen/Qwen2.5-7B-Instruct", "Qwen/Qwen2-0.5B-Instruct", "Qwen/Qwen2-1.5B-Instruct", "Qwen/Qwen2-7B-Instruct", "Qwen/Qwen1.5-MoE-A2.7B-Chat", "HuggingFaceTB/SmolLM-135M-Instruct", "microsoft/Phi-3-mini-4k-instruct", "Groq/Llama-3-Groq-8B-Tool-Use", "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4", 
              "SpectraSuite/TriLM_3.9B_Unpacked", "h2oai/h2o-danube3-500m-chat", "OuteAI/Lite-Mistral-150M-v2-Instruct", "Zyphra/Zamba2-1.2B", "anthracite-org/magnum-v2-4b", ]

# Model ids that can be handed to load_image_model().
imagemodelnames = ["black-forest-labs/FLUX.1-schnell"]

# Indices into the two lists above that pick the initially selected models.
current_model_index = 0
current_image_model_index = 0
modelname = modelnames[current_model_index]
imagemodelname = imagemodelnames[current_image_model_index]
# Set by load_model() to (model_name, model_size, tokenizer_size) after a load.
lastmodelnameinloadfunction = None
# Set by load_image_model() to (imagemodelname, model_size) after a load.
lastimagemodelnameinloadfunction = None

# Load the embedding model
# Used to embed both the knowledge base and incoming queries in retrieve().
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize model and tokenizer as global variables
# They are rebound by load_model() / load_image_model() and reset by clear_all_models().
model = None
tokenizer = None
flux_pipe = None

# Registry keyed by model name. After a load completes, each value is the
# metadata list [load timestamp as str, GPU bytes delta], not the model object.
loaded_models = {}

def get_size_str(bytes):
    """Format a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until its magnitude drops below 1024, using
    the units B, KB, MB, GB and TB. Anything that is still >= 1024 TB is
    reported in PB instead of falling off the loop and returning ``None``.

    Negative values (e.g. a memory delta after a larger model was freed) are
    scaled by magnitude and keep their sign, so ``-2048`` becomes
    ``"-2.00 KB"`` rather than ``"-2048.00 B"``.

    Args:
        bytes: Size in bytes (int or float). The name shadows the builtin but
            is kept because it is part of the existing call signature.

    Returns:
        A string such as ``"1.50 KB"``.
    """
    size = bytes
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        # Compare on magnitude so negative deltas are scaled as well.
        if abs(size) < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    # Reached only for magnitudes of at least 1024 TB.
    return f"{size:.2f} PB"

def load_model(model_name):
    """Load a causal language model and its tokenizer into the module globals.

    The previously held ``model`` / ``tokenizer`` are released first, then the
    new pair is loaded with ``torch_dtype="auto"`` and ``device_map="auto"``.
    A metadata entry ``[load timestamp as str, GPU bytes allocated by the
    load]`` is stored in ``loaded_models[model_name]`` and
    ``lastmodelnameinloadfunction`` is set to
    ``(model_name, model_size, tokenizer_size)``.

    Args:
        model_name: Identifier passed to ``from_pretrained``.

    Returns:
        A status string with the model size, tokenizer size and GPU memory
        used by this load.
    """
    global model, tokenizer, lastmodelnameinloadfunction, loaded_models

    print(f"Loading model and tokenizer: {model_name}")

    # Release the previous model/tokenizer before loading the next one.
    model = None
    tokenizer = None

    # Collect garbage first so that any tensors kept alive only by reference
    # cycles are freed, then let the allocator hand cached blocks back.
    gc.collect()
    torch.cuda.empty_cache()

    # Take the baseline *after* the old model is gone; measuring before the
    # release made the reported delta "new minus old", which could be negative.
    initial_memory = torch.cuda.memory_allocated()

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model_size = sum(p.numel() * p.element_size() for p in model.parameters())
    # sys.getsizeof is shallow, so this is only a rough lower bound.
    tokenizer_size = sum(sys.getsizeof(v) for v in tokenizer.__dict__.values())

    # Calculate memory usage
    final_memory = torch.cuda.memory_allocated()
    memory_used = final_memory - initial_memory

    # The registry keeps metadata only; the live objects are the globals above.
    # NOTE(review): entries for models replaced by a later load are not
    # removed here - confirm whether the registry should list only the
    # currently held model.
    loaded_models[model_name] = [str(time.time()), memory_used]

    lastmodelnameinloadfunction = (model_name, model_size, tokenizer_size)
    print(f"Model and tokenizer {model_name} loaded successfully")
    print(f"Model size: {get_size_str(model_size)}")
    print(f"Tokenizer size: {get_size_str(tokenizer_size)}")
    print(f"GPU memory used: {get_size_str(memory_used)}")

    return (f"Model and tokenizer {model_name} loaded successfully. "
            f"Model size: {get_size_str(model_size)}, "
            f"Tokenizer size: {get_size_str(tokenizer_size)}, "
            f"GPU memory used: {get_size_str(memory_used)}")

def load_image_model(imagemodelname):
    """Load a FLUX image pipeline into the module-global ``flux_pipe``.

    Any previously held pipeline is released first. The pipeline is loaded in
    bfloat16 and switched to model CPU offload. A metadata entry
    ``[load timestamp as str, GPU bytes allocated by the load]`` is stored in
    ``loaded_models[imagemodelname]`` and ``lastimagemodelnameinloadfunction``
    is set to ``(imagemodelname, model_size)``.

    Args:
        imagemodelname: Identifier passed to ``FluxPipeline.from_pretrained``.

    Returns:
        A status string with the transformer size and the GPU memory used by
        this load.
    """
    global flux_pipe, lastimagemodelnameinloadfunction, loaded_models

    print(f"Loading image model: {imagemodelname}")

    # Release the previous pipeline before loading the next one.
    flux_pipe = None

    # Collect garbage first so cyclically referenced tensors are freed, then
    # let the allocator hand cached blocks back.
    gc.collect()
    torch.cuda.empty_cache()

    # Baseline taken after the old pipeline is released so the delta reflects
    # only the new pipeline.
    initial_memory = torch.cuda.memory_allocated()

    flux_pipe = FluxPipeline.from_pretrained(imagemodelname, torch_dtype=torch.bfloat16)
    flux_pipe.enable_model_cpu_offload()
    # Only the transformer's parameters are counted towards the reported size.
    model_size = sum(p.numel() * p.element_size() for p in flux_pipe.transformer.parameters())

    # Calculate memory usage
    # NOTE(review): with CPU offload enabled the weights presumably are not on
    # the GPU yet, so this delta may be close to zero - confirm.
    final_memory = torch.cuda.memory_allocated()
    memory_used = final_memory - initial_memory

    # The registry keeps metadata only; the live pipeline is ``flux_pipe``.
    loaded_models[imagemodelname] = [str(time.time()), memory_used]

    lastimagemodelnameinloadfunction = (imagemodelname, model_size)
    print(f"Model and tokenizer {imagemodelname} loaded successfully")
    print(f"Model size: {get_size_str(model_size)}")
    print(f"GPU memory used: {get_size_str(memory_used)}")

    return (f"Model and tokenizer {imagemodelname} loaded successfully. "
            f"Model size: {get_size_str(model_size)}, "
            f"GPU memory used: {get_size_str(memory_used)}")


def clear_all_models():
    """Drop every loaded model and empty the ``loaded_models`` registry.

    The registry values are plain metadata lists (timestamp, byte count), not
    model objects, so there is nothing to move to the CPU per entry; calling
    ``.to('cpu')`` on them raised ``AttributeError``. Releasing the global
    references is what actually frees the models.

    Returns:
        The confirmation string ``"All models cleared from memory."``.
    """
    global model, tokenizer, flux_pipe, loaded_models

    # The globals are the only references this module holds to the live objects.
    model = None
    tokenizer = None
    flux_pipe = None
    loaded_models.clear()

    # Collect first so cyclically referenced tensors are freed before the
    # allocator is asked to release its cached blocks.
    gc.collect()
    torch.cuda.empty_cache()
    return "All models cleared from memory."

def load_model_list(model_list):
    """Call ``load_model`` for each name in ``model_list``, in order.

    Returns:
        The status strings returned by ``load_model``, joined with newlines
        (an empty string when ``model_list`` is empty).
    """
    return "\n".join(load_model(name) for name in model_list)

def loaded_model_list():
    """Return the module-level ``loaded_models`` registry (the live dict, not a copy)."""
    registry = loaded_models
    return registry


# Initial model load
# Both loaders run at import time, using the default names selected above.
load_model(modelname)
load_image_model(imagemodelname)

# Create embeddings for the knowledge base
# One embedding per knowledge_base entry, computed from its "content" field;
# retrieve() compares query embeddings against these.
knowledge_base_embeddings = embedding_model.encode([doc["content"] for doc in knowledge_base])

def retrieve(query, k=2):
    """Return the ``k`` knowledge-base entries most similar to ``query``.

    The query is embedded with ``embedding_model`` and ranked against
    ``knowledge_base_embeddings`` by cosine similarity, highest first.

    Args:
        query: Text to search for.
        k: Maximum number of entries to return.

    Returns:
        A list of ``(content, id)`` tuples taken from ``knowledge_base``.
    """
    query_vectors = torch.tensor(embedding_model.encode([query]))
    kb_vectors = torch.tensor(knowledge_base_embeddings)
    scores = torch.nn.functional.cosine_similarity(query_vectors, kb_vectors)
    best_first = scores.argsort(descending=True)[:k]

    matches = []
    for idx in best_first:
        matches.append((knowledge_base[idx]["content"], knowledge_base[idx]["id"]))
    return matches

def get_ram_usage():
    """Return a one-line summary of system RAM as reported by ``psutil.virtual_memory()``.

    The string contains the used percentage plus the available and total
    amounts, each divided by 1024 ** 3 and labelled ``GB``.
    """
    bytes_per_gb = 1024 ** 3
    mem = psutil.virtual_memory()
    available_gb = mem.available / bytes_per_gb
    total_gb = mem.total / bytes_per_gb
    return f"RAM Usage: {mem.percent:.2f}%, Available: {available_gb:.2f}GB, Total: {total_gb:.2f}GB"

# Global dictionary to store outputs
output_dict = {}

def empty_output_dict():
    """Rebind the module-level ``output_dict`` to a fresh empty dict.

    The previous dict object is not cleared in place; any other reference to
    it keeps its contents.
    """
    global output_dict
    output_dict = dict()
    print("Output dictionary has been emptied.")

def get_model_details(model):
    """Summarise a model as a small dict.

    Args:
        model: Object exposing ``config.name_or_path``,
            ``config.architectures`` and ``parameters()``.

    Returns:
        A dict with ``"name"``, ``"architecture"`` (the first listed
        architecture, or ``"Unknown"`` when none is listed) and
        ``"num_parameters"`` (total element count over all parameters).
    """
    name = model.config.name_or_path

    if model.config.architectures:
        architecture = model.config.architectures[0]
    else:
        architecture = "Unknown"

    param_total = 0
    for param in model.parameters():
        param_total += param.numel()

    details = {}
    details["name"] = name
    details["architecture"] = architecture
    details["num_parameters"] = param_total
    return details

def get_tokenizer_details(tokenizer):
    """Summarise a tokenizer as a small dict.

    Returns:
        A dict with the tokenizer's class name under ``"name"`` plus its
        ``vocab_size`` and ``model_max_length`` attributes.
    """
    class_name = tokenizer.__class__.__name__
    details = dict(name=class_name)
    details["vocab_size"] = tokenizer.vocab_size
    details["model_max_length"] = tokenizer.model_max_length
    return details

@spaces.GPU
def generate_response(prompt, use_rag, stream=False):
    """Generate a chat reply for ``prompt`` with the currently loaded model.

    This is a generator in both modes: with ``stream=True`` it yields after
    every text chunk produced by the streamer, otherwise it yields exactly
    once with the complete reply. Each call also adds a record to the
    module-level ``output_dict`` under the key ``output_<n>``.

    Args:
        prompt: The user's text.
        use_rag: When truthy, ``retrieve(prompt)`` is called and the returned
            documents are placed in front of the question as context.
        stream: When truthy, generation runs on a worker thread and partial
            text is yielded as it arrives.

    Yields:
        ``(generated_text, tokens_per_second, ram_usage, doc_ids)`` where
        ``tokens_per_second`` is a string with two decimals and ``doc_ids`` is
        the list of retrieved ids, or ``"N/A"`` when there are none.
    """
    global output_dict, model, tokenizer

    print(zero.device)  # This will print 'cuda:0' inside the @spaces.GPU decorated function
    torch.cuda.empty_cache()
    print(dir(model))

    if use_rag:
        retrieved_docs = retrieve(prompt)
        context = " ".join([doc for doc, _ in retrieved_docs])
        doc_ids = [doc_id for _, doc_id in retrieved_docs]
        full_prompt = f"Context: {context}\nQuestion: {prompt}\nAnswer:"
    else:
        full_prompt = prompt
        doc_ids = None
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": full_prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(zero.device)
    start_time = time.time()
    total_tokens = 0

    print(output_dict)
    output_key = f"output_{len(output_dict) + 1}"
    print(output_key)
    # Keep a direct reference to this call's record so the loop below keeps
    # working even if output_dict is rebound (empty_output_dict) mid-stream.
    record = {
        "input_prompt": prompt,
        "full_prompt": full_prompt,
        "use_rag": use_rag,
        "generated_text": "",
        "tokens_per_second": 0,
        "ram_usage": "",
        "doc_ids": doc_ids if doc_ids else "N/A",
        "model_details": get_model_details(model),
        "tokenizer_details": get_tokenizer_details(tokenizer),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time))
    }
    output_dict[output_key] = record
    print(output_dict)

    if stream:
        # skip_prompt=True keeps the echoed chat template out of the streamed
        # text, matching the non-streaming branch which strips the prompt ids.
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            model_inputs,
            streamer=streamer,
            max_new_tokens=512,
            # temperature only takes effect in sampling mode, so enable it explicitly.
            do_sample=True,
            temperature=0.7,
        )
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        for new_text in streamer:
            record["generated_text"] += new_text
            # Counts streamer chunks, so the rate is an approximation of tokens/s.
            total_tokens += 1
            current_time = time.time()
            tokens_per_second = total_tokens / (current_time - start_time)
            ram_usage = get_ram_usage()
            record["tokens_per_second"] = f"{tokens_per_second:.2f}"
            record["ram_usage"] = ram_usage
            yield (record["generated_text"],
                record["tokens_per_second"],
                record["ram_usage"],
                record["doc_ids"])
        # The streamer is exhausted, so generate() is finishing; reap the worker.
        thread.join()
    else:
        # Pass the whole encoding so the attention mask reaches generate(),
        # as it already does in the streaming branch.
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512
        )
        # Drop the prompt tokens from each returned sequence.
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        total_tokens = len(generated_ids[0])
        end_time = time.time()
        tokens_per_second = total_tokens / (end_time - start_time)
        ram_usage = get_ram_usage()

        record["generated_text"] = response
        record["tokens_per_second"] = f"{tokens_per_second:.2f}"
        record["ram_usage"] = ram_usage
        print(output_dict)

        yield (record["generated_text"],
            record["tokens_per_second"],
            record["ram_usage"],
            record["doc_ids"])
            
@spaces.GPU
def generate_image(prompt):
    """Render ``prompt`` with the loaded FLUX pipeline and save it as a PNG.

    The pipeline is called with a CPU generator seeded with 0, 4 inference
    steps and guidance scale 0.0. The image is written to the working
    directory under a name containing the current ``time.time()`` value.

    Returns:
        ``(image_path, ram_usage, image_path)`` - the saved path appears twice.
    """
    global output_dict, flux_pipe

    print(dir(flux_pipe))

    # Generate image using FLUX
    seeded_generator = torch.Generator("cpu").manual_seed(0)
    pipeline_output = flux_pipe(
        prompt,
        guidance_scale=0.0,
        num_inference_steps=4,
        max_sequence_length=256,
        generator=seeded_generator,
    )
    first_image = pipeline_output.images[0]

    image_path = f"flux_output_{time.time()}.png"
    print(image_path)
    first_image.save(image_path)

    return image_path, get_ram_usage(), image_path

def get_output_details(output_key):
    """Look up one record in ``output_dict``.

    Returns:
        The stored record for ``output_key``, or a "No output found" message
        string when the key is absent.
    """
    if output_key not in output_dict:
        return f"No output found for key: {output_key}"
    return output_dict[output_key]

def switch_model(choice):
    """Make ``choice`` the current model name and load it.

    Returns:
        ``(load_message, label)`` where ``load_message`` is what
        ``load_model`` returned and ``label`` is ``"Current model: <name>"``.
    """
    global modelname
    modelname = choice
    # Tuple elements are evaluated left to right: load first, then the label.
    outcome = (load_model(modelname), f"Current model: {modelname}")
    return outcome

def model_change_handler(choice):
    """Switch to ``choice`` and return three values for the caller.

    Returns:
        ``(load_message, current_model_label, load_message)`` - the load
        message is deliberately repeated in the first and third positions.
    """
    load_message, current_label = switch_model(choice)
    return (load_message, current_label, load_message)

def format_output_dict():
    """Render every record in ``output_dict`` as text, print it and return it.

    Each record becomes a ``Key: <key>`` line followed by the record as
    2-space-indented JSON and a blank line.
    """
    sections = [
        f"Key: {key}\n{json.dumps(value, indent=2)}\n\n"
        for key, value in output_dict.items()
    ]
    formatted_output = "".join(sections)
    print(formatted_output)
    return formatted_output