import gradio as gr
import torch
import concurrent.futures
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model and tokenizer (using GPT-2 as an example)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

# Limit intra-op CPU threads so the concurrent generation jobs below do not oversubscribe the host.
torch.set_num_threads(2)


def min_p_sampling(logits, pbase=0.1):
    """
    Perform min-p sampling on the logits, as described in
    https://arxiv.org/abs/2407.01082.

    Args:
        logits (torch.Tensor): Logits for the next token (1D, or 2D with batch size 1).
        pbase (float): Base probability to scale pmax.

    Returns:
        int: The sampled token index.
    """
    # Convert logits to probabilities.
    probs = torch.softmax(logits, dim=-1)

    # 1. Find maximum probability.
    pmax = probs.max()

    # 2. Compute the dynamic threshold.
    pscaled = pbase * pmax

    # 3. Create a mask of tokens with probability >= pscaled.
    mask = probs >= pscaled
    # In the unlikely event that no token meets the threshold, use the full distribution.
    if mask.sum() == 0:
        mask = torch.ones_like(probs, dtype=torch.bool)

    probs_filtered = probs * mask.float()

    # 4. Normalize and sample.
    probs_normalized = probs_filtered / probs_filtered.sum()
    sampled_index = torch.multinomial(probs_normalized, num_samples=1)

    return sampled_index.item()
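
# Worked example (illustrative, not executed): with probs = [0.60, 0.25, 0.10, 0.05]
# and pbase = 0.1, the threshold is pscaled = 0.1 * 0.60 = 0.06, so only the first
# three tokens pass the mask and are renormalized to roughly [0.632, 0.263, 0.105]
# before multinomial sampling.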


def generate_laconic_completion(prompt: str, n: int = 5, max_length: int = 100):
    # Laconic decoding: sample n completions and return the shortest one.
    with torch.no_grad():
        # Encode the prompt and get the attention mask.
        encoded = tokenizer(prompt, return_tensors="pt")
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        # Generate the output.
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            num_return_sequences=n,
            do_sample=True,
        )
        completions = [
            tokenizer.decode(output, skip_special_tokens=True) for output in outputs
        ]
        return min(completions, key=len)


def generate_with_confidence(input_ids, max_length):
    """
    Generate a sequence using greedy decoding while returning the scores.
    """
    outputs = model.generate(
        input_ids,
        max_length=max_length,
        do_sample=False,
        output_scores=True,
        return_dict_in_generate=True,
    )
    return outputs


def compute_answer_confidence(outputs):
    """
    Compute the answer confidence over the generated tokens.
    For each generated token, take the difference between the top-1 and top-2 logits
    and return the average. (The CoT-decoding paper measures this margin over softmax
    probabilities; raw logits are used here as a simpler proxy.)
    """
    diffs = []
    for score in outputs.scores:
        # Get top-2 logit values
        top2 = torch.topk(score[0], 2)
        diff = top2.values[0] - top2.values[1]
        diffs.append(diff.item())

    return sum(diffs) / len(diffs) if diffs else 0.0
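
# Worked example (illustrative): if two generation steps have top-2 logits
# (5.0, 3.5) and (4.2, 4.0), the per-step margins are 1.5 and 0.2, so the answer
# confidence is their mean, 0.85. Larger margins indicate a more confident path.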


def cot_decoding(prompt, k=5, max_length=100):
    """
    Perform Chain-of-Thought (CoT) decoding by exploring top-k alternative paths.
    """
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Get logits for the next token
    with torch.no_grad():
        outputs = model(input_ids)
    logits = outputs.logits[0, -1, :]

    # Get top-k candidate tokens
    topk = torch.topk(logits, k)
    candidate_tokens = topk.indices

    paths = []
    for token in candidate_tokens:
        # Append the candidate token to the prompt
        new_input_ids = torch.cat([input_ids, token.view(1, 1)], dim=1)

        # Generate a full sequence with output scores
        gen_outputs = generate_with_confidence(
            new_input_ids, max_length=new_input_ids.shape[1] + max_length
        )

        # Decode the generated sequence
        generated_text = tokenizer.decode(
            gen_outputs.sequences[0], skip_special_tokens=True
        )

        # Compute answer confidence
        confidence = compute_answer_confidence(gen_outputs)

        paths.append({"text": generated_text, "confidence": confidence})

    return max(paths, key=lambda x: x["confidence"])["text"]
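
# Illustrative call (not executed, prompt is hypothetical):
# cot_decoding("Q: I have 3 apples and eat one. How many remain? A:", k=5)
# branches on the 5 most likely first tokens, greedily completes each branch,
# and returns the completion with the highest average top-1/top-2 margin.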


def generate_completion(prompt, strategy, params):
    """
    Generate a complete answer using model.generate with specified parameters.
    """
    with torch.no_grad():
        # Encode the prompt and get the attention mask.
        encoded = tokenizer(prompt, return_tensors="pt")
        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        # Generate the output.
        output_ids = model.generate(
            input_ids, attention_mask=attention_mask, max_length=100, **params
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


def generate_min_p_completion(prompt, pbase=0.1, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    past = None
    with torch.no_grad():
        for _ in range(max_length - input_ids.size(1)):
            # Only pass the last token if past is available
            outputs = (
                model(input_ids[:, -1:], past_key_values=past)
                if past is not None
                else model(input_ids)
            )
            past = outputs.past_key_values
            logits = outputs.logits[:, -1, :]

            next_token = min_p_sampling(logits, pbase=pbase)
            input_ids = torch.cat([input_ids, torch.tensor([[next_token]])], dim=-1)
            if next_token == tokenizer.eos_token_id:
                break
    return tokenizer.decode(input_ids[0], skip_special_tokens=True)
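
# Illustrative call (not executed): generate_min_p_completion("Once upon a time")
# decodes token by token, reusing past_key_values so each step only feeds the
# most recent token through the model.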


def generate_all(prompt):
    """
    Run multiple decoding strategies concurrently and yield updates as each completes.
    """
    # Define each decoding strategy and its parameters.
    methods = {
        "Greedy": {"type": "default", "params": {"do_sample": False}},
        "Top-k Sampling": {
            "type": "default",
            "params": {"do_sample": True, "top_k": 100},
        },
        "Top-p Sampling": {
            "type": "default",
            "params": {"do_sample": True, "top_p": 0.95},
        },
        "Beam Search": {
            "type": "default",
            "params": {"num_beams": 5, "early_stopping": True},
        },
        "Eta Sampling": {
            "type": "default",
            "params": {"do_sample": True, "eta_cutoff": 0.3},
        },
        "Epsilon Sampling": {
            "type": "default",
            "params": {"do_sample": True, "epsilon_cutoff": 0.2},
        },
        "Min-p Sampling": {"type": "min_p", "pbase": 0.1},
        "laconic": {
            "type": "default",
            "params": {"do_sample": True, "num_return_sequences": 5},
        },
        "COT Decoding": {
            "type": "cot_decoding",
            "params": {"k": 5, "max_length": 100},
        },
    }

    # Define the order for display.
    method_order = [
        "Greedy",
        "Top-k Sampling",
        "Top-p Sampling",
        "Beam Search",
        "Min-p Sampling",
        "Eta Sampling",
        "Epsilon Sampling",
        "laconic",
        "COT Decoding",
    ]
    results = {method: None for method in methods}

    # Yield an initial placeholder state.
    yield tuple("Processing..." for _ in method_order)

    # Use a thread pool to run each generation concurrently.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_to_method = {}
        for method, info in methods.items():
            if info["type"] == "default":
                future = executor.submit(
                    generate_completion, prompt, method, info["params"]
                )
            elif info["type"] == "min_p":
                future = executor.submit(
                    generate_min_p_completion, prompt, info["pbase"]
                )
            elif method == "laconic":
                future = executor.submit(generate_laconic_completion, prompt)
            elif method == "COT Decoding":
                future = executor.submit(cot_decoding, prompt, **info["params"])

            future_to_method[future] = method

        # As each future completes, update its result and yield the current state.
        for future in concurrent.futures.as_completed(future_to_method):
            method = future_to_method[future]
            try:
                result = future.result()
            except Exception as exc:
                result = f"Error: {exc}"
            results[method] = result

            # Yield the results in the pre-defined order; pending methods show "Processing..."
            yield tuple(
                results[m] if results[m] is not None else "Processing..."
                for m in method_order
            )


# Create the Gradio interface.
interface = gr.Interface(
    fn=generate_all,
    inputs=gr.Textbox(lines=3, placeholder="Enter your prompt here...", label="Prompt"),
    outputs=[
        gr.Textbox(label="Greedy"),
        gr.Textbox(label="Top-k Sampling"),
        gr.Textbox(label="Top-p Sampling"),
        gr.Textbox(label="Beam Search"),
        gr.Textbox(label="Min-p Sampling (as in https://arxiv.org/abs/2407.01082)"),
        gr.Textbox(label="Eta Sampling"),
        gr.Textbox(label="Epsilon Sampling"),
        gr.Textbox(
            label="laconic decoding (by Alex Dimakis, 2025, search for twitter thread)"
        ),
        gr.Textbox(
            label="COT Decoding (Chain-of-Thought Reasoning without Prompting, Wang, Zhou, 2024)"
        ),
    ],
    title="Decoding Methods Comparison",
    description="Each decoding method's final answer is printed as soon as it is done. Model used: GPT-2.",
)

if __name__ == "__main__":
    interface.launch()