Add transformers.js tag + sample code (#10)
Browse files

- Add transformers.js tag + sample code (6afabdca55c609a2a76df198365bd4aa05528559)
Co-authored-by: Joshua <[email protected]>
README.md
CHANGED
|
@@ -5,6 +5,7 @@ tags:
|
|
| 5 |
- mteb
|
| 6 |
- clip
|
| 7 |
- vision
|
|
|
|
| 8 |
language: en
|
| 9 |
inference: false
|
| 10 |
license: apache-2.0
|
|
@@ -77,6 +78,44 @@ print(cos_sim(text_embeddings[1], image_embeddings[0])) # text-image cross-modal
|
|
| 77 |
print(cos_sim(text_embeddings[1], image_embeddings[1])) # text-image cross-modal similarity
|
| 78 |
```
|
| 79 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
## Performance
|
| 82 |
|
|
|
|
| 5 |
- mteb
|
| 6 |
- clip
|
| 7 |
- vision
|
| 8 |
+
- transformers.js
|
| 9 |
language: en
|
| 10 |
inference: false
|
| 11 |
license: apache-2.0
|
|
|
|
| 78 |
print(cos_sim(text_embeddings[1], image_embeddings[1])) # text-image cross-modal similarity
|
| 79 |
```
|
| 80 |
|
| 81 |
+
3. JavaScript developers can use Jina CLIP via the [Transformers.js](https://huggingface.co/docs/transformers.js) library. Note that to use this model, you need to install Transformers.js [v3](https://github.com/xenova/transformers.js/tree/v3) from source using `npm install xenova/transformers.js#v3`.
|
| 82 |
+
|
| 83 |
+
```js
|
| 84 |
+
import { AutoTokenizer, CLIPTextModelWithProjection, AutoProcessor, CLIPVisionModelWithProjection, RawImage, cos_sim } from '@xenova/transformers';
|
| 85 |
+
|
| 86 |
+
// Load tokenizer and text model
|
| 87 |
+
const tokenizer = await AutoTokenizer.from_pretrained('jinaai/jina-clip-v1');
|
| 88 |
+
const text_model = await CLIPTextModelWithProjection.from_pretrained('jinaai/jina-clip-v1');
|
| 89 |
+
|
| 90 |
+
// Load processor and vision model
|
| 91 |
+
const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
|
| 92 |
+
const vision_model = await CLIPVisionModelWithProjection.from_pretrained('jinaai/jina-clip-v1');
|
| 93 |
+
|
| 94 |
+
// Run tokenization
|
| 95 |
+
const texts = ['Bridge close-shot', 'Bridge in far away'];
|
| 96 |
+
const text_inputs = tokenizer(texts, { padding: true, truncation: true });
|
| 97 |
+
|
| 98 |
+
// Compute text embeddings
|
| 99 |
+
const { text_embeds } = await text_model(text_inputs);
|
| 100 |
+
|
| 101 |
+
// Read images and run processor
|
| 102 |
+
const urls = [
|
| 103 |
+
'https://fastly.picsum.photos/id/74/4288/2848.jpg?hmac=q02MzzHG23nkhJYRXR-_RgKTr6fpfwRgcXgE0EKvNB8',
|
| 104 |
+
'https://fastly.picsum.photos/id/84/1280/848.jpg?hmac=YFRYDI4UsfbeTzI8ZakNOR98wVU7a-9a2tGF542539s',
|
| 105 |
+
];
|
| 106 |
+
const image = await Promise.all(urls.map(url => RawImage.read(url)));
|
| 107 |
+
const image_inputs = await processor(image);
|
| 108 |
+
|
| 109 |
+
// Compute vision embeddings
|
| 110 |
+
const { image_embeds } = await vision_model(image_inputs);
|
| 111 |
+
|
| 112 |
+
// Compute similarities
|
| 113 |
+
console.log(cos_sim(text_embeds[0].data, text_embeds[1].data)) // text embedding similarity
|
| 114 |
+
console.log(cos_sim(text_embeds[0].data, image_embeds[0].data)) // text-image cross-modal similarity
|
| 115 |
+
console.log(cos_sim(text_embeds[0].data, image_embeds[1].data)) // text-image cross-modal similarity
|
| 116 |
+
console.log(cos_sim(text_embeds[1].data, image_embeds[0].data)) // text-image cross-modal similarity
|
| 117 |
+
console.log(cos_sim(text_embeds[1].data, image_embeds[1].data)) // text-image cross-modal similarity
|
| 118 |
+
```
|
| 119 |
|
| 120 |
## Performance
|
| 121 |
|