import {
  AutoTokenizer,
  CLIPTextModelWithProjection,
  AutoProcessor,
  CLIPVisionModelWithProjection,
  RawImage,
  dot,
  softmax,
} from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.5.0";

// Reference the elements that we will need
const status = document.getElementById("status");
const container = document.getElementById("container");
const video = document.getElementById("video");
const labelsInput = document.getElementById("labels");
const templateInput = document.getElementById("template");
const overlay = document.getElementById("overlay");

status.textContent = "Loading model (88MB)...";
const model_id = "Xenova/mobileclip_s0";

let tokenizer, text_model, processor, vision_model;
try {
  // Load tokenizer and text model
  tokenizer = await AutoTokenizer.from_pretrained(model_id);
  text_model = await CLIPTextModelWithProjection.from_pretrained(model_id, {
    device: "wasm",
    dtype: "q8",
  });

  // Load processor and vision model
  processor = await AutoProcessor.from_pretrained(model_id);
  vision_model = await CLIPVisionModelWithProjection.from_pretrained(model_id, {
    device: "webnn",
    dtype: "fp32",
  });
} catch (err) {
  console.error(err);
  status.textContent = err.message;
  alert(err.message);
  throw err;
}

labelsInput.disabled = false;
templateInput.disabled = false;

status.textContent = "Ready";

// See `model.logit_scale` parameter of original model (≈ ln(100), so the
// exponential scales cosine similarities by ≈ 100 before the softmax)
const exp_logit_scale = Math.exp(4.6052);

const IMAGE_SIZE = 224;
const canvas = document.createElement("canvas");
canvas.width = canvas.height = IMAGE_SIZE;
const context = canvas.getContext("2d", { willReadFrequently: true });

// Per-frame state for the animation loop
let isProcessing = false;
let previousTime;
let textEmbeddings;
let prevTextInputs;
let prevTemplate;
let labels;

// Run inference for at most one frame at a time; frames that arrive while a
// pass is still in flight are skipped.
function onFrameUpdate() {
  if (!isProcessing) {
    isProcessing = true;
    (async function () {
      // If text inputs have changed, update the embeddings
      if (
        prevTextInputs !== labelsInput.value ||
        prevTemplate !== templateInput.value
      ) {
        textEmbeddings = null;
        prevTextInputs = labelsInput.value;
        prevTemplate = templateInput.value;

        // Parse the comma-separated label list, dropping empty entries
        labels = prevTextInputs.split(/\s*,\s*/).filter((x) => x);

        if (labels.length > 0) {
          const texts = labels.map((x) =>
            templateInput.value.replaceAll("{}", x),
          );

          const text_inputs = tokenizer(texts, {
            padding: "max_length", // NB: the model requires max_length padding
            truncation: true,
          });

          // Compute embeddings
          const { text_embeds } = await text_model(text_inputs);
          textEmbeddings = text_embeds.normalize().tolist();
        } else {
          overlay.innerHTML = "";
        }
      }

      if (textEmbeddings) {
        // Read the current frame from the video
        context.drawImage(video, 0, 0, IMAGE_SIZE, IMAGE_SIZE);
        const pixelData = context.getImageData(
          0,
          0,
          IMAGE_SIZE,
          IMAGE_SIZE,
        ).data;
        const image = new RawImage(pixelData, IMAGE_SIZE, IMAGE_SIZE, 4); // RGBA

        const image_inputs = await processor(image);

        // Compute embeddings
        const { image_embeds } = await vision_model(image_inputs);
        const imageEmbedding = image_embeds.normalize().tolist()[0];

        // Compute similarity
        const similarities = textEmbeddings.map(
          (x) => dot(x, imageEmbedding) * exp_logit_scale,
        );

        // Convert the scaled similarities to probabilities and sort the
        // [score, index] pairs in descending order of score
        const sortedIndices = softmax(similarities)
          .map((x, i) => [x, i])
          .sort((a, b) => b[0] - a[0]);

        // Update UI
        overlay.innerHTML = "";
        for (const [score, index] of sortedIndices) {
          overlay.appendChild(
            document.createTextNode(`${labels[index]}: ${score.toFixed(2)}`),
          );
          overlay.appendChild(document.createElement("br"));
        }
      }

      if (previousTime !== undefined) {
        const fps = 1000 / (performance.now() - previousTime);
        status.textContent = `FPS: ${fps.toFixed(2)}`;
      }
      previousTime = performance.now();
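      // Mark this pass as done so the next animation frame can start a new one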
      isProcessing = false;
    })();
  }

  window.requestAnimationFrame(onFrameUpdate);
}

// Start the video stream
navigator.mediaDevices
  .getUserMedia(
    { video: true }, // Ask for video
  )
  .then((stream) => {
    // Set up the video and canvas elements.
    video.srcObject = stream;
    video.play();

    const videoTrack = stream.getVideoTracks()[0];
    const { width, height } = videoTrack.getSettings();

    video.width = width;
    video.height = height;

    // Set container width and height depending on the image aspect ratio,
    // fitting the video inside a 720x405 box
    const ar = width / height;
    const [cw, ch] = ar > 720 / 405 ? [720, 720 / ar] : [405 * ar, 405];
    container.style.width = `${cw}px`;
    container.style.height = `${ch}px`;

    // Start the animation loop
    window.requestAnimationFrame(onFrameUpdate);
  })
  .catch((error) => {
    alert(error);
  });
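/*
  A minimal sketch of the host-page markup this script assumes (not part of
  this file). The element IDs must match the getElementById calls above; the
  layout, attributes, and default template text below are assumptions for
  illustration only:

    <div id="container">
      <video id="video" autoplay muted playsinline></video>
      <div id="overlay"></div>
    </div>
    <input id="labels" disabled placeholder="Comma-separated labels" />
    <input id="template" disabled value="A photo of a {}" />
    <p id="status"></p>

  The inputs start disabled because the script enables them only after the
  models have finished loading.
*/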