Update app.py
app.py CHANGED
@@ -1,13 +1,13 @@
-"""
-Fun little experiment.
-"""
-
-
 import gradio as gr
 import torch
 import concurrent.futures
+import threading
 from transformers import AutoTokenizer, AutoModelForCausalLM
 
+# Create a lock to serialize access to the model
+model_lock = threading.Lock()
+
+# Load the model and tokenizer (using GPT-2 as an example)
 model_name = "gpt2"
 tokenizer = AutoTokenizer.from_pretrained(model_name)
 model = AutoModelForCausalLM.from_pretrained(model_name)
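Note: the new model_lock is there because generate_all fans the decoding strategies out to worker threads while they all share one model instance, and concurrent calls into a single transformers model are not guaranteed to be thread-safe. A minimal sketch of the pattern in isolation (the locked_call helper is hypothetical, not part of app.py):

import threading

model_lock = threading.Lock()

def locked_call(fn, *args, **kwargs):
    # Serialize model access: only one thread runs the model at a time.
    with model_lock:
        return fn(*args, **kwargs)

The trade-off is that generation itself is serialized; the threads only overlap the surrounding bookkeeping, so total latency stays roughly the sum of the individual strategies.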
@@ -52,13 +52,18 @@ def generate_completion(prompt, strategy, params):
     Generate a complete answer using model.generate with specified parameters.
     """
     # Encode the prompt and get the attention mask.
-    tokenizer.pad_token = tokenizer.eos_token
     encoded = tokenizer(prompt, return_tensors="pt", padding=True)
     input_ids = encoded["input_ids"]
     attention_mask = encoded["attention_mask"]
 
-    #
-    output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, **params)
+    # Use the lock when calling the model
+    with model_lock:
+        output_ids = model.generate(
+            input_ids,
+            attention_mask=attention_mask,
+            max_length=50,
+            **params
+        )
     return tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
 def generate_min_p_completion(prompt, pbase=0.1, max_length=50):
@@ -70,7 +75,9 @@ def generate_min_p_completion(prompt, pbase=0.1, max_length=50):
 
     # Generate up to max_length tokens.
     for _ in range(max_length - input_ids.size(1)):
-        outputs = model(input_ids)
+        # Lock the model call to ensure thread safety.
+        with model_lock:
+            outputs = model(input_ids)
         logits = outputs.logits[:, -1, :]  # Get logits for the last token.
         next_token = min_p_sampling(logits, pbase=pbase)
 
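Note: min_p_sampling is a custom helper defined elsewhere in app.py, outside the changed hunks. Min-p sampling keeps only the tokens whose probability is at least pbase times the probability of the most likely token, renormalizes, and samples from what remains. A sketch of what such a helper typically looks like (an illustration, not the file's actual code):

import torch

def min_p_sampling(logits, pbase=0.1):
    # Turn the last-token logits into probabilities.
    probs = torch.softmax(logits, dim=-1)
    # The cutoff scales with the top token's probability.
    threshold = pbase * probs.max(dim=-1, keepdim=True).values
    # Zero out everything below the cutoff and renormalize.
    probs = torch.where(probs >= threshold, probs, torch.zeros_like(probs))
    probs = probs / probs.sum(dim=-1, keepdim=True)
    # Returns shape (1, 1), ready to be concatenated onto input_ids.
    return torch.multinomial(probs, num_samples=1)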
@@ -88,7 +95,6 @@ def generate_all(prompt):
     Run multiple decoding strategies concurrently and yield updates as each completes.
     """
     # Define each decoding strategy and its parameters.
-    # For the default strategies, we use model.generate; for "Min-p Sampling" we use our custom function.
     methods = {
         "Greedy": {"type": "default", "params": {"do_sample": False}},
         "Top-k Sampling": {"type": "default", "params": {"do_sample": True, "top_k": 50}},
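Note: the body of generate_all between these hunks is unchanged and therefore not shown. Given the concurrent.futures import and the docstring, the dispatch presumably resembles the following sketch (names and wiring beyond generate_completion and generate_min_p_completion are assumptions):

import concurrent.futures

def generate_all_sketch(prompt, methods):
    # Placeholder text per strategy; overwrite each slot as its future finishes.
    results = {name: "Generating..." for name in methods}
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = {}
        for name, spec in methods.items():
            if spec["type"] == "default":
                fut = executor.submit(generate_completion, prompt, name, spec["params"])
            else:  # the custom min-p path
                fut = executor.submit(generate_min_p_completion, prompt, **spec["params"])
            futures[fut] = name
        for future in concurrent.futures.as_completed(futures):
            results[futures[future]] = future.result()
            # Yield one value per output textbox, in a fixed order.
            yield tuple(results[name] for name in methods)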
@@ -137,7 +143,7 @@ interface = gr.Interface(
         gr.Textbox(label="Min-p Sampling"),
     ],
     title="Decoding Methods Comparison",
-    description="Each decoding method's final answer is printed as soon as it is done."
+    description="Each decoding method's final answer is printed as soon as it is done, including custom min-p sampling."
 )
 
 if __name__ == "__main__":
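Note: because generate_all is a generator, gr.Interface streams each yielded tuple into the output textboxes as it arrives, which is what lets each method's answer show up as soon as it finishes; on older Gradio releases this requires enabling the queue, e.g. interface.queue().launch().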