Granite Guardian HAP 38M ONNX Model
This repository contains ONNX exports of the Granite Guardian HAP 38M model in two variants: a full-precision model and an INT8-quantized model. Both are optimized for efficient inference in production environments.
ibm-granite/granite-guardian-hap-38m
Model Variants
ONNX Model (`guardian_model.onnx`):
- Full precision FP32 model
- Best for scenarios requiring maximum accuracy
Quantized Model (`guardian_model_quantized.onnx`):
- INT8 quantized model
- Optimized for faster inference and smaller size
- Maintains comparable accuracy to the original model
Model Usage
C# Implementation
Installation
- Create a new .NET project (targeting .NET 8.0 or later)
- Add required NuGet packages:
<PackageReference Include="Microsoft.ML.OnnxRuntime" Version="1.17.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.3" />
Basic Usage
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Http;
using Microsoft.ML.OnnxRuntime;
using Microsoft.ML.OnnxRuntime.Tensors;
using Newtonsoft.Json;
/// <summary>
/// Runs the Granite Guardian HAP ONNX model on a single piece of text.
/// </summary>
/// <remarks>
/// Tokenization is a lowercase, whitespace-split vocabulary lookup that is padded or
/// truncated to 128 ids. NOTE(review): the Python sample loads tokenizer.json through
/// PreTrainedTokenizerFast instead; confirm both paths produce the same ids for your inputs.
/// </remarks>
public class GuardianModel : IDisposable
{
    private const int MaxSequenceLength = 128;
    private const string StartToken = "<s>";
    private const string EndToken = "</s>";
    private const string UnknownToken = "<unk>";
    private const string PadToken = "<pad>";

    private readonly InferenceSession _session;
    private readonly Dictionary<string, int> _tokenizer;
    private readonly int _padTokenId;
    private bool _disposed;

    /// <summary>
    /// Loads the tokenizer vocabulary and opens an ONNX Runtime session.
    /// </summary>
    /// <param name="modelPath">Path to the .onnx model file.</param>
    /// <param name="tokenizerPath">
    /// Location of the vocabulary JSON (token to id map): an absolute http(s) URL or a local file path.
    /// </param>
    /// <exception cref="ArgumentException">An argument is null or empty.</exception>
    /// <exception cref="InvalidDataException">The vocabulary JSON deserializes to null.</exception>
    /// <exception cref="KeyNotFoundException">The vocabulary has no padding token.</exception>
    public GuardianModel(string modelPath, string tokenizerPath)
    {
        ArgumentException.ThrowIfNullOrEmpty(modelPath);
        ArgumentException.ThrowIfNullOrEmpty(tokenizerPath);

        // The vocabulary is loaded first so that a bad tokenizer source cannot
        // leave an undisposed session behind.
        var vocabJson = LoadVocabularyJson(tokenizerPath);
        _tokenizer = JsonConvert.DeserializeObject<Dictionary<string, int>>(vocabJson)
            ?? throw new InvalidDataException($"Tokenizer vocabulary at '{tokenizerPath}' is empty.");

        if (!_tokenizer.TryGetValue(PadToken, out _padTokenId))
        {
            throw new KeyNotFoundException($"Tokenizer vocabulary has no '{PadToken}' token.");
        }

        _session = new InferenceSession(modelPath);
    }

    /// <summary>
    /// Classifies <paramref name="text"/> and returns the winning class index with its score.
    /// </summary>
    /// <param name="text">Text to classify.</param>
    /// <returns>
    /// The index of the largest logit and that logit's value. The value is the raw
    /// model output, not a normalized probability.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <exception cref="ObjectDisposedException">The model has been disposed.</exception>
    /// <exception cref="InvalidOperationException">The model produced no output values.</exception>
    public (int Prediction, float Confidence) Predict(string text)
    {
        ArgumentNullException.ThrowIfNull(text);
        ObjectDisposedException.ThrowIf(_disposed, this);

        var tokens = TokenizeText(text);

        // Both inputs are int64 tensors of shape [1, sequence_length].
        var inputDims = new[] { 1, tokens.Length };
        var inputTensor = new DenseTensor<long>(inputDims);
        var attentionMask = new DenseTensor<long>(inputDims);
        for (var i = 0; i < tokens.Length; i++)
        {
            inputTensor[0, i] = tokens[i];
            attentionMask[0, i] = tokens[i] == _padTokenId ? 0 : 1;
        }

        var inputs = new List<NamedOnnxValue>
        {
            NamedOnnxValue.CreateFromTensor("input_ids", inputTensor),
            NamedOnnxValue.CreateFromTensor("attention_mask", attentionMask)
        };

        using var outputs = _session.Run(inputs);
        var logits = outputs.First().AsTensor<float>().ToArray();
        if (logits.Length == 0)
        {
            throw new InvalidOperationException("The model returned no logits.");
        }

        // Argmax; a strict comparison keeps the first index when values tie.
        var bestIndex = 0;
        for (var i = 1; i < logits.Length; i++)
        {
            if (logits[i] > logits[bestIndex])
            {
                bestIndex = i;
            }
        }

        return (bestIndex, logits[bestIndex]);
    }

    /// <summary>Releases the underlying ONNX Runtime session.</summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _session.Dispose();
        _disposed = true;
        GC.SuppressFinalize(this);
    }

    /// <summary>Reads the vocabulary JSON from an http(s) URL or from a local file.</summary>
    private static string LoadVocabularyJson(string tokenizerPath)
    {
        var isWebUri = Uri.TryCreate(tokenizerPath, UriKind.Absolute, out var uri)
            && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps);
        if (!isWebUri)
        {
            return File.ReadAllText(tokenizerPath);
        }

        // A constructor cannot await, so use the synchronous HttpClient API
        // rather than blocking on an async call with .Result.
        using var client = new HttpClient();
        using var request = new HttpRequestMessage(HttpMethod.Get, uri);
        using var response = client.Send(request);
        response.EnsureSuccessStatusCode();
        using var reader = new StreamReader(response.Content.ReadAsStream());
        return reader.ReadToEnd();
    }

    /// <summary>
    /// Converts text into exactly 128 token ids: optional start token, one id per
    /// whitespace-separated lowercase word, optional end token, then padding.
    /// </summary>
    /// <exception cref="KeyNotFoundException">
    /// A word is missing from the vocabulary and the vocabulary has no unknown token.
    /// </exception>
    private int[] TokenizeText(string text)
    {
        // An empty separator list splits on any whitespace; empty entries are
        // dropped so repeated spaces do not turn into unknown tokens.
        var words = text.ToLowerInvariant()
            .Split(Array.Empty<char>(), StringSplitOptions.RemoveEmptyEntries);

        var tokens = new List<int>(MaxSequenceLength);
        if (_tokenizer.TryGetValue(StartToken, out var startId))
        {
            tokens.Add(startId);
        }

        // Reserve one slot for the end token so truncation never cuts it off.
        var hasEndToken = _tokenizer.TryGetValue(EndToken, out var endId);
        var wordBudget = MaxSequenceLength - tokens.Count - (hasEndToken ? 1 : 0);

        foreach (var word in words.Take(wordBudget))
        {
            if (_tokenizer.TryGetValue(word, out var wordId))
            {
                tokens.Add(wordId);
            }
            else if (_tokenizer.TryGetValue(UnknownToken, out var unknownId))
            {
                tokens.Add(unknownId);
            }
            else
            {
                throw new KeyNotFoundException($"Tokenizer vocabulary has no '{UnknownToken}' token.");
            }
        }

        if (hasEndToken)
        {
            tokens.Add(endId);
        }

        while (tokens.Count < MaxSequenceLength)
        {
            tokens.Add(_padTokenId);
        }

        return tokens.ToArray();
    }
}
// Example usage
// NOTE: if these statements share a file with GuardianModel, C# requires
// top-level statements to come before the class declaration.
const string modelFile = "path/to/guardian_model.onnx";
const string vocabUrl = "https://huggingface.co/KantiArumilli/granite-guardian-hap-38m-onnx/raw/main/tokenizer/vocab.json";

var model = new GuardianModel(modelFile, vocabUrl);

// Predict returns a named tuple: the winning class index and its score.
var result = model.Predict("Your text here");
Console.WriteLine($"Prediction: {result.Prediction}, Confidence: {result.Confidence:F4}");
Important Implementation Notes
Special Tokens:
- Start token: `<s>`
- End token: `</s>`
- Unknown token: `<unk>`
- Padding token: `<pad>`
Input Processing:
- Maximum sequence length: 128 tokens
- Right-side padding and truncation
- Case-insensitive tokenization
- Attention mask: 1 for real tokens, 0 for padding
Model Inputs:
- `input_ids`: Token IDs (shape: [batch_size, sequence_length])
- `attention_mask`: Attention mask (shape: [batch_size, sequence_length])
Model Output:
- Logits for each class
- Use argmax for final prediction
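- Use max value for confidence score (this is a raw logit, not a probability; apply softmax to the logits if you need a score between 0 and 1)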
Python Implementation
Installation
pip install transformers onnxruntime numpy huggingface_hub
Basic Usage
import numpy as np
import onnxruntime as ort
from transformers import PreTrainedTokenizerFast
from huggingface_hub import hf_hub_download
def load_model(model_path, use_gpu=False):
    """Open an ONNX Runtime inference session for the model at ``model_path``.

    Args:
        model_path: Path to the ``.onnx`` file.
        use_gpu: When truthy, ``CUDAExecutionProvider`` is listed ahead of
            ``CPUExecutionProvider``; otherwise only the CPU provider is requested.

    Returns:
        The ``ort.InferenceSession`` created for the model.
    """
    execution_providers = ['CPUExecutionProvider']
    if use_gpu:
        execution_providers.insert(0, 'CUDAExecutionProvider')
    return ort.InferenceSession(model_path, providers=execution_providers)
def load_tokenizer(model_id):
    """Download ``tokenizer/tokenizer.json`` from the Hub repo and build a fast tokenizer.

    Args:
        model_id: Hugging Face Hub repository id that hosts the tokenizer file.

    Returns:
        A ``PreTrainedTokenizerFast`` with right-side padding and truncation and
        the ``<pad>``, ``</s>``, ``<s>`` and ``<unk>`` special tokens registered.
    """
    # Fetch the serialized tokenizer and build the tokenizer from it in one step.
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=hf_hub_download(
            repo_id=model_id,
            filename="tokenizer/tokenizer.json",
        ),
        padding_side="right",
        truncation_side="right",
    )

    # Register the special tokens on the tokenizer.
    fast_tokenizer.add_special_tokens(
        dict(
            pad_token="<pad>",
            eos_token="</s>",
            bos_token="<s>",
            unk_token="<unk>",
        )
    )
    return fast_tokenizer
def predict(text, model, tokenizer):
    """Classify ``text`` and return the winning class index with its score.

    Args:
        text: Input string to classify.
        model: Object exposing ``run(output_names, input_feed)``, such as the
            session returned by ``load_model``.
        tokenizer: Callable returning a mapping with ``input_ids`` and
            ``attention_mask`` arrays, such as the result of ``load_tokenizer``.

    Returns:
        ``(prediction, confidence)``: the argmax class index as a plain ``int``
        and the largest logit as a plain ``float``. The confidence is the raw
        logit, not a normalized probability.
    """
    # Tokenize input
    encoded = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="np"
    )

    # Force int64: NumPy's default integer type is platform-dependent, while the
    # C# sample feeds this model int64 (`long`) tensors for both inputs.
    onnx_inputs = {
        "input_ids": np.asarray(encoded["input_ids"], dtype=np.int64),
        "attention_mask": np.asarray(encoded["attention_mask"], dtype=np.int64),
    }

    # Run inference; the first output holds the logits.
    logits = model.run(None, onnx_inputs)[0]

    # Convert NumPy scalars to built-in types so callers can serialize them.
    prediction = int(np.argmax(logits, axis=1)[0])
    confidence = float(np.max(logits, axis=1)[0])
    return prediction, confidence
# Example usage
model_id = "KantiArumilli/granite-guardian-hap-38m-onnx"
model_file = "guardian_model.onnx"  # or "guardian_model_quantized.onnx"

# Download the model file, then open the session and build the tokenizer.
model = load_model(hf_hub_download(repo_id=model_id, filename=model_file))
tokenizer = load_tokenizer(model_id)

# Classify one sample string and print the result.
prediction, confidence = predict("Your text here", model, tokenizer)
print(f"Prediction: {prediction}, Confidence: {confidence:.4f}")
Presented by Mr. Kanti Arumilli Founder & CEO ALight Technology And Services Limited and ALight Technologies USA Inc.
Inference Providers
NEW
This model is not currently available via any of the supported third-party Inference Providers, and
HF Inference API was unable to determine this model's library.
Model tree for KantiArumilli/granite-guardian-hap-38m-onnx
Base model
ibm-granite/granite-guardian-hap-38m