"""Gradio demo app: loads the htigenai/finetune_test_2_4bit causal LM and serves a simple text-generation UI."""

import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import logging
import sys
import gc
import time
from contextlib import contextmanager

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

@contextmanager
def timer(description: str):
    """Context manager that logs how long the wrapped block took."""
    start = time.time()
    yield
    elapsed = time.time() - start
    logger.info(f"{description}: {elapsed:.2f} seconds")

def log_system_info():
    """Log system information for debugging."""
    logger.info(f"Python version: {sys.version}")
    logger.info(f"PyTorch version: {torch.__version__}")
    logger.info(f"Device: {torch.device('cuda' if torch.cuda.is_available() else 'cpu')}")
    if torch.cuda.is_available():
        logger.info(f"GPU: {torch.cuda.get_device_name(0)}")
        logger.info(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

print("Starting application...")
log_system_info()

try:
    print("Loading model and tokenizer...")
    model_id = "htigenai/finetune_test_2_4bit"

    with timer("Loading tokenizer"):
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True
        )
        # Some causal LM tokenizers ship without a pad token; fall back to EOS
        # so that padding during tokenization and generation does not fail.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        logger.info("Tokenizer loaded successfully")

    with timer("Loading model"):
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            use_cache=True,  # Enable KV cache for faster generation
        )
        # Optimize model for inference
        model.eval()
        if torch.cuda.is_available():
            torch.backends.cudnn.benchmark = True
        logger.info("Model loaded successfully")

    def generate_text(prompt, max_tokens=200, temperature=0.7):
        """Generate text based on the input prompt."""
        try:
            logger.info(f"Starting generation for prompt: {prompt[:50]}...")

            with timer("Tokenization"):
                inputs = tokenizer(
                    prompt,
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=256  # Reduced for faster processing
                )
                inputs = {k: v.to(model.device) for k, v in inputs.items()}

            with timer("Generation"):
                # Autocast only when CUDA is actually available.
                with torch.inference_mode(), torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
                    outputs = model.generate(
                        **inputs,
                        max_new_tokens=max_tokens,
                        temperature=temperature,
                        top_p=0.95,
                        do_sample=True,
                        pad_token_id=tokenizer.pad_token_id,
                        eos_token_id=tokenizer.eos_token_id,
                        repetition_penalty=1.1,
                        num_beams=1,  # Disable beam search for speed
                        use_cache=True
                    )

            with timer("Decoding"):
                generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

            logger.info("Text generation completed successfully")

            # Clean up
            with timer("Cleanup"):
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            return generated_text

        except Exception as e:
            logger.error(f"Error during generation: {str(e)}")
            return f"Error during generation: {str(e)}"

    # Create Gradio interface with performance controls
    iface = gr.Interface(
        fn=generate_text,
        inputs=[
            gr.Textbox(
                lines=3,
                placeholder="Enter your prompt here...",
                label="Input Prompt"
            ),
            gr.Slider(
                minimum=50,
                maximum=200,
                value=100,
                step=10,
                label="Max Tokens"
            ),
            gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.7,
                step=0.1,
                label="Temperature"
            )
        ],
        outputs=gr.Textbox(
            label="Generated Response",
            lines=5
        ),
        title="HTIGENAI Reflection Analyzer - Test",
        description="Enter a prompt and adjust generation parameters for a speed/quality trade-off. Lower max tokens will generate faster.",
        examples=[
            ["What are your thoughts about cats?", 100, 0.7],
            ["Write a short story about a magical forest", 150, 0.8],
            ["Explain quantum computing to a 5-year-old", 75, 0.5],
        ]
    )

    # Launch the interface
    iface.launch(
        share=False,
        debug=True,
        server_name="0.0.0.0"
    )

except Exception as e:
    logger.error(f"Application startup failed: {str(e)}")
    raise