How to use?

This model is used in optillm to route between the various approaches based on the prompt.

To use the model with optillm you can just prepend router to the model name. E.g. if we set router-gpt-4o-mini as the model, it will use the gpt-4o-mini as the base model.

Otherwise, refer to the code in router-plugin to see how to use this model for classification.

Usage

To use the model directly you will need to use our OptILMClassifier class as we added additional layers to the base model. The additional effort_encoder is used to take into account the number of tokens a given approach consumes. Also, note the mapping of the returned index to the APPROACHES list as shown below.

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer, AutoConfig
from huggingface_hub import hf_hub_download
from safetensors import safe_open
from safetensors.torch import load_model
from transformers import AutoTokenizer, AutoModel

# Constants
MAX_LENGTH = 512
APPROACHES = ["none", "mcts", "bon", "moa", "rto", "z3", "self_consistency", "pvg", "rstar", "cot_reflection", "plansearch", "leap", "re2"]
MODEL_NAME = "codelion/optillm-bert-uncased"

class OptILMClassifier(nn.Module):
    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        self.effort_encoder = nn.Sequential(
            nn.Linear(1, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )
        self.classifier = nn.Linear(base_model.config.hidden_size + 64, num_labels)

    def forward(self, input_ids, attention_mask, effort):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0]  # Shape: (batch_size, hidden_size)
        effort_encoded = self.effort_encoder(effort.unsqueeze(1))  # Shape: (batch_size, 64)
        combined_input = torch.cat((pooled_output, effort_encoded), dim=1)
        logits = self.classifier(combined_input)
        return logits

def load_optillm_model():
    device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
    # Load the base model
    base_model = AutoModel.from_pretrained("google-bert/bert-large-uncased")
    # Create the OptILMClassifier
    model = OptILMClassifier(base_model, num_labels=len(APPROACHES))  
    model.to(device)
    # Download the safetensors file
    safetensors_path = hf_hub_download(repo_id=MODEL_NAME, filename="model.safetensors")
    # Load the state dict from the safetensors file
    load_model(model, safetensors_path)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    return model, tokenizer, device

def preprocess_input(tokenizer, system_prompt, initial_query):
    combined_input = f"{system_prompt}\n\nUser: {initial_query}"
    encoding = tokenizer.encode_plus(
        combined_input,
        add_special_tokens=True,
        max_length=MAX_LENGTH,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return encoding['input_ids'], encoding['attention_mask']

def predict_approach(model, input_ids, attention_mask, device, effort=0.7):
    model.eval()
    with torch.no_grad():
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        effort_tensor = torch.tensor([effort], dtype=torch.float).to(device)
        
        logits = model(input_ids, attention_mask=attention_mask, effort=effort_tensor)
        probabilities = F.softmax(logits, dim=1)
        predicted_approach_index = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_approach_index].item()
    
    return APPROACHES[predicted_approach_index], confidence

You can now use the predict_approach method to get the predicted approach as follows:

# Load the trained model
router_model, tokenizer, device = load_optillm_model()

# Preprocess the input
input_ids, attention_mask = preprocess_input(tokenizer, system_prompt, initial_query)

# Predict the best approach
predicted_approach, _ = predict_approach(router_model, input_ids, attention_mask, device)

print(f"Router predicted approach: {predicted_approach}")

Citation

If you use this in your work, please cite:

@software{optillm,
  title = {Optillm: Optimizing inference proxy for LLMs},
  author = {Asankhaya Sharma},
  year = {2024},
  publisher = {GitHub},
  url = {https://github.com/codelion/optillm}
}

codelion
/

optillm-bert-uncased

How to use?

Usage

Citation

Model tree for codelion/optillm-bert-uncased

Dataset used to train codelion/optillm-bert-uncased