gowitheflow committed · Commit 42f8a05 · Parent(s): 2bff784

Update README.md
README.md CHANGED
All the training sets involved in our progressive training scheme that we created can be found in the tags in the metadata. Please refer to the paper for the exact process.

## Inference

First install the package following our GitHub repo. Then define our `PixelLinguist` class as follows.
```python
import torch
from PIL import Image
from pixel import (
    AutoConfig,
    PangoCairoTextRenderer,
    PIXELForSequenceClassification,
    PIXELForRepresentation,
    PoolingMode,
    get_attention_mask,
    get_transforms,
    glue_strip_spaces,
    resize_model_embeddings,
)
from tqdm import tqdm

class PixelLinguist:
    def __init__(self, model_name, batch_size=16, max_seq_length=64,
                 device=None, keep_mlp=False):
        if device is not None:
            self.device = device
        else:
            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.config = AutoConfig.from_pretrained(model_name, num_labels=0)
        self.batch_size = batch_size
        if keep_mlp:
            # Keep the MLP head on top of the encoder.
            self.model = PIXELForSequenceClassification.from_pretrained(
                model_name,
                config=self.config,
                pooling_mode=PoolingMode.from_string("mean"),
                add_layer_norm=True,
            ).to(self.device)
        else:
            # Default: representation model returning pooled embeddings.
            self.model = PIXELForRepresentation.from_pretrained(
                model_name,
                config=self.config,
                pooling_mode=PoolingMode.from_string("mean"),
                add_layer_norm=True,
            ).to(self.device)
        self.processor = PangoCairoTextRenderer.from_pretrained(model_name, rgb=False)
        self.processor.max_seq_length = max_seq_length
        resize_model_embeddings(self.model, self.processor.max_seq_length)
        self.transforms = get_transforms(
            do_resize=True,
            size=(self.processor.pixels_per_patch,
                  self.processor.pixels_per_patch * self.processor.max_seq_length),
        )

    def preprocess(self, texts):
        # Render each text to an image, then build pixel and attention-mask tensors.
        encodings = [self.processor(text=glue_strip_spaces(a)) for a in texts]
        pixel_values = torch.stack(
            [self.transforms(Image.fromarray(e.pixel_values)) for e in encodings]
        )
        attention_mask = torch.stack(
            [get_attention_mask(e.num_text_patches, seq_length=self.processor.max_seq_length)
             for e in encodings]
        )
        return {"pixel_values": pixel_values, "attention_mask": attention_mask}

    def encode(self, texts, **kwargs):
        all_outputs = []
        for i in tqdm(range(0, len(texts), self.batch_size)):
            batch_texts = texts[i:i + self.batch_size]
            inputs = self.preprocess(batch_texts)
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs).logits.detach().cpu()
            all_outputs.append(outputs)
        return torch.cat(all_outputs, dim=0)
```
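Here `keep_mlp=True` loads the checkpoint through `PIXELForSequenceClassification`, keeping the MLP head on top of the encoder, while the default path uses `PIXELForRepresentation`, whose `.logits` output is the mean-pooled, layer-normalized sentence embedding used below.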
A minimal example to do inference and similarity computation:

```python
model = PixelLinguist(model_name)  # model_name: local path or hub id of this checkpoint
texts = ["I love you", "I like you"]
embeddings = model.encode(texts)
print(embeddings[0] @ embeddings[1].T)  # a dot product suffices: the model class normalizes embeddings automatically
# tensor(0.9217)
```
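Since `encode` returns normalized embeddings, the same dot-product comparison extends naturally to small-scale retrieval. A minimal sketch (the query and candidate texts below are illustrative placeholders):

```python
queries = ["I love you"]
candidates = ["I like you", "I hate you", "The weather is nice today"]

q_emb = model.encode(queries)      # shape (1, dim); already normalized
c_emb = model.encode(candidates)   # shape (3, dim)

scores = q_emb @ c_emb.T           # dot products equal cosine similarities here
best = scores[0].argmax().item()
print(candidates[best])            # the candidate most similar to the query
```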