sachinkidzure's picture
initial (#1)
135b069 verified
raw
history blame
1.02 kB
import torch
from transformers import PreTrainedModel, XLMRobertaConfig, XLMRobertaModel
class MCLIPConfig(XLMRobertaConfig):
model_type = "M-CLIP"
def __init__(self, transformerDimSize=1024, imageDimSize=768, **kwargs):
self.transformerDimensions = transformerDimSize
self.numDims = imageDimSize
super().__init__(**kwargs)
class MultilingualCLIP(PreTrainedModel):
config_class = MCLIPConfig
def __init__(self, config, *args, **kwargs):
super().__init__(config, *args, **kwargs)
self.transformer = XLMRobertaModel(config)
self.LinearTransformation = torch.nn.Linear(
in_features=config.transformerDimensions, out_features=config.numDims
)
def forward(self, input_ids, attention_mask):
embs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)[0]
embs2 = (embs * attention_mask.unsqueeze(2)).sum(dim=1) / attention_mask.sum(dim=1)[:, None]
return self.LinearTransformation(embs2), embs