import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr
# Hugging Face Hub IDs: the sharded bf16 Falcon base model and the fine-tuned LoRA adapter
model_name = "ybelkada/falcon-7b-sharded-bf16"
fine_tuned_model = "mounseflit/falcon-7b-marrakech"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
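# Falcon's tokenizer ships without a dedicated pad token. Falling back to EOS is
# a common workaround (an assumption about this checkpoint, hence the guard); it
# only matters if you later tokenize batches with padding=True.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token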
# Load the base model on CPU (no quantization); device_map="auto" lets accelerate
# place the weights, and anything that does not fit in RAM is offloaded to disk
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",         # automatic placement (resolves to CPU without a GPU)
    offload_folder="offload",  # spill oversized weights to disk to save memory
    offload_state_dict=True,   # offload the state dict while loading to cut peak RAM
)
# Apply the fine-tuned LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, fine_tuned_model)
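# Optional: merging the adapter weights into the base model removes the LoRA
# indirection at inference time, which can help a little on CPU. A sketch,
# assuming a standard LoRA adapter (merge_and_unload() only supports those):
# model = model.merge_and_unload()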
# Ensure the model is in evaluation mode
model.eval()
# Generate a completion for a prompt, with tight token limits to stay responsive on CPU
def generate_text(prompt):
    # Truncate the prompt to 50 tokens to keep CPU inference fast
    inputs = tokenizer(prompt, return_tensors="pt", max_length=50, truncation=True).to("cpu")
    with torch.no_grad():
        # max_new_tokens caps only the generated tokens; the original max_length=100
        # also counted the prompt, so this keeps roughly the same output budget
        outputs = model.generate(**inputs, max_new_tokens=50)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
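# Quick smoke test without the UI (hypothetical prompt), uncomment to try:
# print(generate_text("Describe the medina of Marrakech in one sentence."))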
# Create Gradio interface
iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite (CPU)")
# Launch the app
iface.launch()
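# On Hugging Face Spaces the default launch() is enough; when running inside your
# own container you may need to bind explicitly, e.g.:
# iface.launch(server_name="0.0.0.0", server_port=7860)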