diff --git "a/Tutorials/.ipynb_checkpoints/0_Embeddings-checkpoint.html" "b/Tutorials/.ipynb_checkpoints/0_Embeddings-checkpoint.html" new file mode 100644--- /dev/null +++ "b/Tutorials/.ipynb_checkpoints/0_Embeddings-checkpoint.html" @@ -0,0 +1,14340 @@ + + +
# Use the trained astroBERT model to generate embeddings of text
# to be used for downstream tasks
This tutorial will show you how to load astroBERT and produce text embeddings that can be used for downstream tasks.

# 1 - load models and tokenizer
from transformers import AutoTokenizer, AutoModel

2022-10-17 12:10:19.355203: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
# the model path can either be the name of the Hugging Face repository
remote_model_path = 'adsabs/astroBERT'
# or the local path to the directory containing the model weights and tokenizer vocab
local_model_path = '../'
# make sure you load the tokenizer with do_lower_case=False
astroBERT_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=remote_model_path,
                                                    use_auth_token=True,
                                                    add_special_tokens=True,
                                                    do_lower_case=False,
                                                    )
astroBERT_tokenizer

PreTrainedTokenizerFast(name_or_path='adsabs/astroBERT', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
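As a quick sanity check (an addition, not part of the original notebook), you can inspect how the tokenizer splits a string into WordPiece subwords; the string below is only an illustration.

# illustrative example: look at the subword tokens and ids the tokenizer produces
example = 'Dark energy appears to homogeneously permeate all of space.'
print(astroBERT_tokenizer.tokenize(example))
# the ids include the [CLS] and [SEP] special tokens added around the sentence
print(astroBERT_tokenizer(example)['input_ids'])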
# AutoModel defaults to BertModel
# it's normal to get warnings, as a BertModel will not load the weights used for pretraining
astroBERT_automodel = AutoModel.from_pretrained(remote_model_path,
                                                use_auth_token=True,
                                                )

Some weights of the model checkpoint at adsabs/astroBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
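A quick check (an addition, not in the original notebook) that AutoModel resolved to a BertModel with the expected embedding size:

# confirm the class that AutoModel instantiated and its hidden size
print(type(astroBERT_automodel).__name__)        # expected: BertModel
print(astroBERT_automodel.config.hidden_size)    # expected: 768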
# 2 - run inference; the outputs are the embeddings
# list of strings for which we want embeddings
strings = ['The Chandra X-ray Observatory (CXO), previously known as the Advanced X-ray Astrophysics Facility (AXAF), is a Flagship-class space telescope launched aboard the Space Shuttle Columbia during STS-93 by NASA on July 23, 1999.',
           'Independent lines of evidence from Type Ia supernovae and the CMB imply that the universe today is dominated by a mysterious form of energy known as dark energy, which appears to homogeneously permeate all of space.',
           'This work has been developed in the framework of the ‘Darklight’ programme, supported by the European Research Council through an Advanced Research Grant to LG (Project # 291521).'
           ]
# tokenize the strings, with padding (needed to process multiple strings efficiently)
inputs = astroBERT_tokenizer(strings,
                             padding=True,
                             return_tensors='pt'
                             )

# check the shape of the inputs
print(inputs['input_ids'].shape)

torch.Size([3, 54])
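Padding makes all three tokenized strings the same length (54 here); the attention_mask returned by the tokenizer records which positions are real tokens (1) and which are [PAD] (0). A quick check, not in the original notebook:

# number of real (non-padding) tokens in each of the three strings
print(inputs['attention_mask'].sum(dim=1))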
# pass the inputs through astroBERT
import torch
# no need for gradients, since we are only doing inference
with torch.no_grad():
    output = astroBERT_automodel(**inputs,
                                 output_hidden_states=False
                                 )
# BertModel outputs two tensors: last_hidden_state (our embeddings) and pooler_output (to be discarded as it's not meaningful)
# see https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel.forward
# embeddings will have shape = (# of strings, length of the tokenized (padded) strings, 768 (BERT embedding size))
embeddings = output[0]
print(embeddings.shape)

torch.Size([3, 54, 768])
print(embeddings[0])

tensor([[ 0.5546,  0.9121,  0.6550,  ..., -0.1925,  0.7077, -0.2405],
        [ 0.6252,  0.3175,  1.0899,  ...,  0.0576,  0.0529,  0.0603],
        [ 0.1803, -0.4567,  1.2688,  ...,  0.6026, -0.5718, -0.2060],
        ...,
        [-0.4397, -0.5334,  1.1682,  ...,  0.9541,  0.4046, -0.4756],
        [-0.3911,  0.7793,  0.2432,  ...,  0.2268, -1.0489, -1.4864],
        [-0.4529, -0.7346,  0.0675,  ..., -0.3246, -0.2333, -0.6154]])
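The last_hidden_state gives one 768-dimensional vector per token. For downstream tasks that need a single vector per string, a common recipe (a sketch added here, not part of the original notebook) is to mean-pool the token embeddings while ignoring the padded positions, and then compare the pooled vectors, e.g. with cosine similarity:

# sketch: mean-pool the token embeddings into one vector per string,
# using the attention_mask to exclude the padding positions
mask = inputs['attention_mask'].unsqueeze(-1)               # shape (3, 54, 1)
sentence_embeddings = (embeddings * mask).sum(dim=1) / mask.sum(dim=1)
print(sentence_embeddings.shape)                            # expected: torch.Size([3, 768])

# example downstream use: cosine similarity between the first two strings
similarity = torch.nn.functional.cosine_similarity(sentence_embeddings[0],
                                                   sentence_embeddings[1],
                                                   dim=0)
print(similarity)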