Update app.py
app.py
CHANGED
@@ -1,12 +1,8 @@
 import gradio as gr
 import torch
 import concurrent.futures
-import threading
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
-# Create a lock to serialize access to the model
-model_lock = threading.Lock()
-
 # Load the model and tokenizer (using GPT-2 as an example)
 model_name = "gpt2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -52,19 +48,12 @@ def generate_completion(prompt, strategy, params):
     Generate a complete answer using model.generate with specified parameters.
     """
     # Encode the prompt and get the attention mask.
-
-    encoded = tokenizer(prompt, return_tensors="pt", padding=True)
+    encoded = tokenizer(prompt, return_tensors="pt")
     input_ids = encoded["input_ids"]
     attention_mask = encoded["attention_mask"]
 
-    #
-
-    output_ids = model.generate(
-        input_ids,
-        attention_mask=attention_mask,
-        max_length=50,
-        **params
-    )
+    # Generate the output.
+    output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, **params)
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
 def generate_min_p_completion(prompt, pbase=0.1, max_length=50):
@@ -76,9 +65,7 @@ def generate_min_p_completion(prompt, pbase=0.1, max_length=50):
 
     # Generate up to max_length tokens.
     for _ in range(max_length - input_ids.size(1)):
-
-        with model_lock:
-            outputs = model(input_ids)
+        outputs = model(input_ids)
         logits = outputs.logits[:, -1, :]  # Get logits for the last token.
         next_token = min_p_sampling(logits, pbase=pbase)
 
@@ -96,6 +83,7 @@ def generate_all(prompt):
     Run multiple decoding strategies concurrently and yield updates as each completes.
     """
     # Define each decoding strategy and its parameters.
+    # For the default strategies, we use model.generate; for "Min-p Sampling" we use our custom function.
     methods = {
         "Greedy": {"type": "default", "params": {"do_sample": False}},
         "Top-k Sampling": {"type": "default", "params": {"do_sample": True, "top_k": 50}},
@@ -144,7 +132,7 @@ interface = gr.Interface(
         gr.Textbox(label="Min-p Sampling"),
     ],
     title="Decoding Methods Comparison",
-    description="
+    description="Each decoding method's final answer is printed as soon as it is done, including custom min-p sampling."
 )
 
 if __name__ == "__main__":
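The sampling loop above calls a custom min_p_sampling helper that is defined elsewhere in app.py and is not part of this diff. For reference, here is a minimal sketch of min-p sampling, assuming pbase sets the cutoff as a fraction of the most likely token's probability; the file's actual implementation may differ:

import torch

def min_p_sampling(logits, pbase=0.1):
    # Keep only tokens whose probability is at least pbase times the
    # probability of the most likely token, then sample among the survivors.
    probs = torch.softmax(logits, dim=-1)
    max_prob = probs.max(dim=-1, keepdim=True).values
    probs = probs.masked_fill(probs < pbase * max_prob, 0.0)
    probs = probs / probs.sum(dim=-1, keepdim=True)  # renormalize survivors
    return torch.multinomial(probs, num_samples=1)   # shape: (batch, 1)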
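The generate_all docstring describes running every strategy concurrently and yielding updates as each completes. A minimal sketch of that pattern with concurrent.futures, using illustrative names rather than the file's actual ones:

import concurrent.futures

def run_all(prompt, strategies):
    # strategies: dict mapping a label to a callable prompt -> str.
    results = {name: "" for name in strategies}
    with concurrent.futures.ThreadPoolExecutor() as pool:
        futures = {pool.submit(fn, prompt): name for name, fn in strategies.items()}
        for future in concurrent.futures.as_completed(futures):
            results[futures[future]] = future.result()
            yield tuple(results.values())  # one update per finished strategy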