import gradio as gr
from vllm import LLM, SamplingParams

# Load the model with vLLM; it pulls the weights and tokenizer
# from the Hugging Face Hub automatically, so no separate
# transformers tokenizer is needed
model_name = "Qwen/Qwen2-7B"
llm = LLM(model=model_name)
def generate_response(prompt, max_tokens, temperature, top_p):
    # Define sampling parameters (sliders return floats, so cast max_tokens)
    sampling_params = SamplingParams(
        max_tokens=int(max_tokens),
        temperature=temperature,
        top_p=top_p,
    )
    # Generate text using vLLM; generate() accepts raw prompt strings and
    # returns a list of RequestOutput objects with the decoded completions
    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text
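
# A quick sanity check outside the UI (hypothetical prompt; the output
# depends on the model and the sampling settings):
#   print(generate_response("Write a haiku about GPUs.", 50, 0.7, 0.9))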
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# Hugging Face Integration with vLLM")
    gr.Markdown("Generate text using the vLLM integration with Hugging Face models.")
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Enter your prompt here...",
                lines=3,
            )
            max_tokens = gr.Slider(
                label="Max Tokens",
                minimum=10,
                maximum=500,
                value=100,
                step=10,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
            )
            submit_button = gr.Button("Generate")
        with gr.Column():
            output_text = gr.Textbox(
                label="Generated Text",
                lines=10,
                interactive=False,
            )
    # Wire the button to the generation function
    # (event handlers must be registered inside the Blocks context)
    submit_button.click(
        generate_response,
        inputs=[prompt_input, max_tokens, temperature, top_p],
        outputs=output_text,
    )

# Launch the app
demo.launch()
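
# To try this locally (assuming the file is saved as app.py and the
# dependencies are installed with `pip install gradio vllm`), run:
#
#   python app.py
#
# Note: vLLM generally expects a CUDA GPU, and Qwen/Qwen2-7B needs
# roughly 15 GB of VRAM in fp16/bf16; a smaller checkpoint such as
# "Qwen/Qwen2-0.5B" is a reasonable swap on constrained hardware.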