Spaces:

vmuchinov
/

sysprompt

Running on Zero

App Files Files Community

vmuchinov committed 3 days ago

Commit

e4cd356

•

1 Parent(s): f8310bd

Upload 2 files

Browse files

Files changed (2) hide show

app.py +15 -44
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -6,55 +6,26 @@ import gradio as gr
 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-import deepspeed
-# Configurable constants
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
-# Model ID for Qwen model
-model_id = "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8"
-# DeepSpeed configuration
-deepspeed_config = {
-    "train_batch_size": 1,
-    "fp16": {
-        "enabled": True
-    },
-    "zero_optimization": {
-        "stage": 3,  # Enable ZeRO stage 3 for maximum memory efficiency
-        "offload_param": {
-            "device": "cpu",
-            "pin_memory": True
-        },
-        "offload_optimizer": {
-            "device": "cpu",
-            "pin_memory": True
-        }
-    },
-    "gradient_checkpointing": True  # Enables gradient checkpointing for further memory savings
-}
-# Load model with DeepSpeed
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
     trust_remote_code=True,
-    device_map="auto",  # Use device mapping with DeepSpeed
-    load_in_8bit=True,  # Use 8-bit quantization
-    token=ACCESS_TOKEN
-)
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
     trust_remote_code=True,
-    token=ACCESS_TOKEN
-)
 tokenizer.use_default_system_prompt = False
-# Initialize DeepSpeed for the loaded model
-model = deepspeed.init_inference(model, config=deepspeed_config, mp_size=1)
 @spaces.GPU
 def generate(
@@ -64,45 +35,45 @@ def generate(
     temperature: float = 0.01,
     top_p: float = 1.00,
 ) -> Iterator[str]:
-    # Define the conversation prompt format
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
-    # Tokenize the input with the conversation template
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
-    # Set up the text streaming options for output
     streamer = TextIteratorStreamer(tokenizer, timeout=600.0, skip_prompt=True, skip_special_tokens=True)
-    # Set up generation parameters
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
         top_p=top_p,
         temperature=temperature,
         num_beams=1,
-        pad_token_id=tokenizer.eos_token_id,
     )
-    # Run generation in a separate thread for streaming
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
-    # Stream the output text
     outputs = []
     for text in streamer:
         outputs.append(text)
         yield "".join(outputs)
-# Gradio interface setup
 chat_interface = gr.Interface(
     fn=generate,
     inputs=[
@@ -133,7 +104,7 @@ chat_interface = gr.Interface(
             value=1.0,
         ),
     ],
-    title="Model testing - Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int8",
     description="Provide system settings and a prompt to interact with the model.",
 )

 import spaces
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 ACCESS_TOKEN = os.getenv("HF_TOKEN", "")
+model_id = "Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4"
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
     torch_dtype=torch.float16,
+    device_map="auto",
     trust_remote_code=True,
+    token=ACCESS_TOKEN)
 tokenizer = AutoTokenizer.from_pretrained(
     model_id,
     trust_remote_code=True,
+    token=ACCESS_TOKEN)
 tokenizer.use_default_system_prompt = False
 @spaces.GPU
 def generate(
     temperature: float = 0.01,
     top_p: float = 1.00,
 ) -> Iterator[str]:
     conversation = []
     if system_prompt:
         conversation.append({"role": "system", "content": system_prompt})
     conversation.append({"role": "user", "content": message})
     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt")
     if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
         input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
     input_ids = input_ids.to(model.device)
+    '''
+    terminators = [
+        tokenizer.eos_token_id,
+        tokenizer.convert_tokens_to_ids("<|eot_id|>")
+    ]
+    '''
     streamer = TextIteratorStreamer(tokenizer, timeout=600.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
         {"input_ids": input_ids},
         streamer=streamer,
         max_new_tokens=max_new_tokens,
+        #eos_token_id=terminators,
         do_sample=True,
         top_p=top_p,
         temperature=temperature,
         num_beams=1,
+        pad_token_id=tokenizer.eos_token_id,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
     outputs = []
     for text in streamer:
         outputs.append(text)
         yield "".join(outputs)
 chat_interface = gr.Interface(
     fn=generate,
     inputs=[
             value=1.0,
         ),
     ],
+    title="Model testing - Qwen/Qwen2.5-Coder-32B-Instruct-GPTQ-Int4",
     description="Provide system settings and a prompt to interact with the model.",
 )

requirements.txt CHANGED Viewed

@@ -246,4 +246,3 @@ einops
 pytest
 gguf>=0.10.0
 autoawq
-deepspeed

 pytest
 gguf>=0.10.0
 autoawq