vmuchinov committed
Commit f8310bd
1 Parent(s): 211550f

Upload 2 files

Files changed (2)
  1. app.py +44 -15
  2. requirements.txt +1 -0
app.py CHANGED
@@ -6,26 +6,55 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+import deepspeed
 
+# Configurable constants
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
 
-model_id = "Qwen/Qwen2.5-Coder-3B-Instruct"
+# Model ID for Qwen model
+model_id = "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8"
 
+# DeepSpeed configuration
+deepspeed_config = {
+    "train_batch_size": 1,
+    "fp16": {
+        "enabled": True
+    },
+    "zero_optimization": {
+        "stage": 3,  # Enable ZeRO stage 3 for maximum memory efficiency
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": True
+        },
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": True
+        }
+    },
+    "gradient_checkpointing": True  # Enables gradient checkpointing for further memory savings
+}
+
+# Load model with DeepSpeed
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
-    device_map="auto",
     trust_remote_code=True,
-    token=ACCESS_TOKEN)
+    device_map="auto",  # Use device mapping with DeepSpeed
+    load_in_8bit=True,  # Use 8-bit quantization
+    token=ACCESS_TOKEN
+)
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
     trust_remote_code=True,
-    token=ACCESS_TOKEN)
+    token=ACCESS_TOKEN
+)
 tokenizer.use_default_system_prompt = False
 
+# Initialize DeepSpeed for the loaded model
+model = deepspeed.init_inference(model, config=deepspeed_config, mp_size=1)
 
 @spaces.GPU
 def generate(
@@ -35,45 +64,45 @@ def generate(
     temperature: float = 0.01,
     top_p: float = 1.00,
 ) -> Iterator[str]:
+    # Define the conversation prompt format
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
 
+    # Tokenize the input with the conversation template
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
 
-    '''
-    terminators = [
-        tokenizer.eos_token_id,
-        tokenizer.convert_tokens_to_ids("<|eot_id|>")
-    ]
-    '''
-
+    # Set up the text streaming options for output
     streamer = TextIteratorStreamer(tokenizer, timeout=600.0, skip_prompt=True, skip_special_tokens=True)
+
+    # Set up generation parameters
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
-        #eos_token_id=terminators,
        do_sample=True,
         top_p=top_p,
         temperature=temperature,
         num_beams=1,
-        pad_token_id=tokenizer.eos_token_id,
+        pad_token_id=tokenizer.eos_token_id,
     )
+
+    # Run generation in a separate thread for streaming
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
 
+    # Stream the output text
     outputs = []
     for text in streamer:
         outputs.append(text)
         yield "".join(outputs)
 
-
+# Gradio interface setup
 chat_interface = gr.Interface(
     fn=generate,
     inputs=[
@@ -104,7 +133,7 @@ chat_interface = gr.Interface(
             value=1.0,
         ),
     ],
-    title="Model testing - Qwen/Qwen2.5-Coder-3B-Instruct",
+    title="Model testing - Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8",
     description="Provide system settings and a prompt to interact with the model.",
 )
 
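For reference, the DeepSpeed wrapping added above goes through deepspeed.init_inference, which takes an already-loaded module and returns an inference engine. The committed config also carries ZeRO/offload keys that are primarily training-time options, so the sketch below sticks to the inference-side arguments only. This is a minimal, hedged sketch, not the Space's code: it assumes a CUDA GPU with deepspeed and transformers installed, and uses a small stand-in checkpoint instead of the 32B GPTQ model from app.py.

# Minimal sketch of wrapping a Hugging Face causal LM with DeepSpeed-Inference.
# Assumptions: CUDA GPU available; "gpt2" is an illustrative stand-in model,
# not the Qwen2.5-Coder checkpoint used in app.py.
import torch
import deepspeed
from transformers import AutoModelForCausalLM, AutoTokenizer

stub_id = "gpt2"  # placeholder; any causal LM demonstrates the pattern
tok = AutoTokenizer.from_pretrained(stub_id)
lm = AutoModelForCausalLM.from_pretrained(stub_id, torch_dtype=torch.float16)

# mp_size=1 disables tensor parallelism; dtype matches the fp16 weights above.
engine = deepspeed.init_inference(lm, mp_size=1, dtype=torch.float16)
lm = engine.module  # generate() is called on the wrapped module

inputs = tok("def fibonacci(n):", return_tensors="pt").to(lm.device)
print(tok.decode(lm.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))
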
requirements.txt CHANGED
@@ -246,3 +246,4 @@ einops
 pytest
 gguf>=0.10.0
 autoawq
+deepspeed
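
The deepspeed entry added to requirements.txt is unpinned, so the Space picks up whichever release pip resolves at build time. A quick, hedged sanity check that the dependency imports cleanly in that environment:

# Confirms the newly added dependency resolves; prints whatever version pip
# installed, since requirements.txt does not pin one.
import deepspeed
print("deepspeed", deepspeed.__version__)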