import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import gradio as gr

# Base model and fine-tuned LoRA adapter
model_name = "ybelkada/falcon-7b-sharded-bf16"
fine_tuned_model = "mounseflit/falcon-7b-marrakech"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load base model (CPU-only, no quantization)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",          # Let accelerate place modules (CPU here)
    offload_folder="offload",   # Offload large parts of the model to disk to save memory
    offload_state_dict=True,    # Offload the state dict while loading to reduce peak memory
)

# Load the fine-tuned LoRA adapter on top of the base model
model = PeftModel.from_pretrained(base_model, fine_tuned_model)

# Put the model in evaluation mode (disables dropout)
model.eval()
# Generate text from a prompt
def generate_text(prompt):
    # Truncate the prompt to keep CPU memory use low
    inputs = tokenizer(prompt, return_tensors="pt", max_length=50, truncation=True).to("cpu")
    with torch.no_grad():
        # Cap total length (prompt + completion); Falcon has no pad token,
        # so reuse the EOS token id to avoid a generation warning
        outputs = model.generate(**inputs, max_length=100, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
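
# Optional local smoke test before wiring up the UI. The prompt below is a
# hypothetical example; CPU generation with a 7B model is slow, so it stays
# commented out by default:
# print(generate_text("What are the best places to visit in Marrakech?"))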
# Create Gradio interface
iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Falcon 7B Lite (CPU)")

# Launch the app
iface.launch()
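
# When running outside Spaces, launch() can also expose a temporary public
# URL via Gradio's share flag, e.g.:
# iface.launch(share=True)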