import gradio as gr
from vllm import LLM, SamplingParams

# Load the model with vLLM; the weights and tokenizer are pulled from the
# Hugging Face Hub automatically, so no separate transformers calls are needed
model_name = "Qwen/Qwen2-7B"
llm = LLM(model=model_name)

def generate_response(prompt, max_tokens, temperature, top_p):
    # Define sampling parameters (cast max_tokens, since the slider may pass a float)
    sampling_params = SamplingParams(
        max_tokens=int(max_tokens),
        temperature=temperature,
        top_p=top_p,
    )
    # Generate text using vLLM; it accepts raw prompt strings and returns
    # RequestOutput objects with the decoded completion already attached
    outputs = llm.generate([prompt], sampling_params)
    return outputs[0].outputs[0].text

# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🚀 Hugging Face Integration with vLLM")
    gr.Markdown("Generate text using the vLLM integration with Hugging Face models.")
    with gr.Row():
        with gr.Column():
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Enter your prompt here...",
                lines=3,
            )
            max_tokens = gr.Slider(
                label="Max Tokens",
                minimum=10,
                maximum=500,
                value=100,
                step=10,
            )
            temperature = gr.Slider(
                label="Temperature",
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
            )
            top_p = gr.Slider(
                label="Top P",
                minimum=0.1,
                maximum=1.0,
                value=0.9,
                step=0.1,
            )
            submit_button = gr.Button("Generate")
        with gr.Column():
            output_text = gr.Textbox(
                label="Generated Text",
                lines=10,
                interactive=False,
            )

    # Wire the button to the generation function (must be inside the Blocks context)
    submit_button.click(
        generate_response,
        inputs=[prompt_input, max_tokens, temperature, top_p],
        outputs=output_text,
    )

# Launch the app
demo.launch()
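
# A minimal sketch of optional tweaks, not part of the original script and
# dependent on your deployment: vLLM's LLM constructor accepts
# tensor_parallel_size to shard a large model across multiple GPUs, and
# Gradio's launch() accepts share=True to expose a temporary public URL.
#
#     llm = LLM(model=model_name, tensor_parallel_size=2)  # assumes 2 GPUs
#     demo.launch(share=True)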