diff --git "a/Tutorials/.ipynb_checkpoints/0_Embeddings-checkpoint.html" "b/Tutorials/.ipynb_checkpoints/0_Embeddings-checkpoint.html" deleted file mode 100644--- "a/Tutorials/.ipynb_checkpoints/0_Embeddings-checkpoint.html" +++ /dev/null @@ -1,14340 +0,0 @@ - - -
# Use the trained astroBERT model to generate embeddings of text to be used for downstream tasks

This tutorial will show you how to load astroBERT and produce text embeddings that can be used for downstream tasks.

# 1 - load the model and tokenizer

from transformers import AutoTokenizer, AutoModel

# the model path can either be the name of the Hugging Face repository
remote_model_path = 'adsabs/astroBERT'
# or the local path to the directory containing the model weights and tokenizer vocab
local_model_path = '../'

# make sure you load the tokenizer with do_lower_case=False
astroBERT_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=remote_model_path,
                                                    use_auth_token=True,
                                                    add_special_tokens=True,
                                                    do_lower_case=False,
                                                    )

astroBERT_tokenizer

PreTrainedTokenizerFast(name_or_path='adsabs/astroBERT', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
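
As a quick sanity check (an illustrative step, not part of the original notebook), you can tokenize a short string and convert the ids back to tokens to confirm that casing is preserved; the sample sentence below is arbitrary, and the exact sub-word split depends on the astroBERT vocabulary.

# illustrative sanity check: tokenize one string and inspect the resulting tokens
sample = 'The Chandra X-ray Observatory was launched in 1999.'
sample_ids = astroBERT_tokenizer(sample)['input_ids']
print(astroBERT_tokenizer.convert_ids_to_tokens(sample_ids))
# the first and last tokens should be '[CLS]' and '[SEP]', and casing is kept since do_lower_case=False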

# AutoModel defaults to BertModel for this checkpoint
# it's normal to get warnings, as a BertModel will not load the weights used only for pre-training
astroBERT_automodel = AutoModel.from_pretrained(remote_model_path,
                                                use_auth_token=True,
                                                )

Some weights of the model checkpoint at adsabs/astroBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
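
The model returned by from_pretrained should already be in evaluation mode, but making this explicit does no harm; the parameter count below is only an optional, illustrative check that is not part of the original notebook.

# optional, illustrative: make evaluation mode explicit (disables dropout) and count parameters
astroBERT_automodel.eval()
print(sum(p.numel() for p in astroBERT_automodel.parameters()))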

# 2 - run inference: the outputs are the embeddings

# list of strings for which we want embeddings
strings = ['The Chandra X-ray Observatory (CXO), previously known as the Advanced X-ray Astrophysics Facility (AXAF), is a Flagship-class space telescope launched aboard the Space Shuttle Columbia during STS-93 by NASA on July 23, 1999.',
           'Independent lines of evidence from Type Ia supernovae and the CMB imply that the universe today is dominated by a mysterious form of energy known as dark energy, which appears to homogeneously permeate all of space.',
           'This work has been developed in the framework of the ‘Darklight’ programme, supported by the European Research Council through an Advanced Research Grant to LG (Project # 291521).'
           ]

# tokenize the strings, with padding (needed to process multiple strings efficiently)
inputs = astroBERT_tokenizer(strings,
                             padding=True,
                             return_tensors='pt'
                             )

# check the shape of the inputs
print(inputs['input_ids'].shape)

torch.Size([3, 54])
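
Along with the input_ids, the tokenizer returns an attention_mask of the same shape, with 1 for real tokens and 0 for padding; inspecting it here is an illustrative check that is not in the original notebook, but the mask is what lets you ignore padding if you pool the embeddings later.

# illustrative: the attention_mask flags real tokens (1) versus padding (0)
print(inputs['attention_mask'].shape)
print(inputs['attention_mask'][0])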

# pass the inputs through astroBERT
import torch
# no need for gradients, since we are only doing inference
with torch.no_grad():
    output = astroBERT_automodel(**inputs,
                                 output_hidden_states=False
                                 )

# BertModel outputs two tensors: last_hidden_state (our embeddings) and pooler_output (to be discarded, as it is not meaningful)
# see https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel.forward
# embeddings will have shape = (# of strings, length of the tokenized (padded) strings, 768 (the BERT embedding size))
embeddings = output[0]
print(embeddings.shape)

torch.Size([3, 54, 768])
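
The embeddings above are per-token. For many downstream tasks (clustering, similarity search, classification) you want one fixed-size vector per string; a common recipe, shown here as a sketch rather than anything prescribed by astroBERT, is to average the token embeddings while using the attention_mask to exclude padding.

# illustrative sketch: mean-pool the token embeddings into one 768-d vector per string,
# masking out the padding tokens so they do not contribute to the average
mask = inputs['attention_mask'].unsqueeze(-1).float()  # shape (3, 54, 1)
sentence_embeddings = (embeddings * mask).sum(dim=1) / mask.sum(dim=1)
print(sentence_embeddings.shape)  # expected: torch.Size([3, 768])
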
print(embeddings[0])

tensor([[ 0.5546,  0.9121,  0.6550,  ..., -0.1925,  0.7077, -0.2405],
        [ 0.6252,  0.3175,  1.0899,  ...,  0.0576,  0.0529,  0.0603],
        [ 0.1803, -0.4567,  1.2688,  ...,  0.6026, -0.5718, -0.2060],
        ...,
        [-0.4397, -0.5334,  1.1682,  ...,  0.9541,  0.4046, -0.4756],
        [-0.3911,  0.7793,  0.2432,  ...,  0.2268, -1.0489, -1.4864],
        [-0.4529, -0.7346,  0.0675,  ..., -0.3246, -0.2333, -0.6154]])
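
As a final illustration of a downstream use, and assuming the mean-pooled sentence_embeddings from the sketch above, you can compare the three strings with cosine similarity; the exact values depend on the model and on the pooling choice.

# illustrative: cosine similarity between the mean-pooled sentence vectors
import torch.nn.functional as F
normed = F.normalize(sentence_embeddings, p=2, dim=1)
print(normed @ normed.T)  # 3x3 matrix of pairwise cosine similarities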