# RGMC / infer.py
# Author: NikitaSrivatsan
# (commit 866a431 — "Fixed output text box")
from audiocaptioner import AudioCaptioner
from data_module import AudiostockDataset
from utils import *
def infer(input_filename):
    """Generate a text caption for a single audio file.

    Loads the trained AudioCaptioner checkpoint from GCS, precomputes
    retrieval neighbors for the input against the training split, then
    beam-decodes a caption conditioned on the audio prefix plus the
    nearest-neighbor caption's token embeddings.

    Args:
        input_filename: path to the audio file to caption (e.g. a .wav/.mp3
            readable by AudiostockDataset.read_wav).

    Returns:
        The post-processed caption string (also printed to stdout).
    """
    device = get_device(0)
    # connect to GCS
    gcs = CheckpointManager()
    # create and/or load model
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', local_files_only=False)
    prefix_dim = 512
    prefix_length = 10
    prefix_length_clip = 10
    num_layers = 8
    checkpoint = 'checkpoints/ZRIUE-BEST.pt'
    model = AudioCaptioner(prefix_length, clip_length=prefix_length_clip,
                           prefix_size=prefix_dim, num_layers=num_layers).to(device)
    model.load_state_dict(gcs.get_checkpoint(checkpoint))
    print(f'Loaded from {checkpoint}')
    model.eval()
    # read in the wav file and precompute neighbors
    dataset_path = ''
    # fix: read the split file with a context manager instead of leaking the
    # handle via open(...).read()
    with open('audiostock-train-240k.txt', 'r') as split_file:
        train_file_list = split_file.read().split()
    train_dataset = AudiostockDataset(
        dataset_path=dataset_path,
        train=False,
        split='audiostock-train-240k.txt',
        factor=1.0,
        verbose=False,
        file_list=train_file_list
    )
    print('Reading in file', input_filename)
    dataset = AudiostockDataset(
        dataset_path=dataset_path,
        train=False,
        split=None,
        factor=1.0,
        verbose=False,
        file_list=[input_filename]  # manually override file list
    )
    dataset.precompute_neighbors(model, candidate_set=train_dataset)
    waveform = dataset.read_wav(input_filename).unsqueeze(0).to(device, dtype=torch.float32)
    # predict
    with torch.no_grad():
        prefix_embed = model.create_prefix(waveform, 1)
        # neighbor lookup is keyed by the input's basename without extension
        neighbor_id = os.path.basename(input_filename).split('.')[0]
        # nearest-neighbor caption tokens, truncated to 150, appended to the
        # audio prefix before decoding
        tweet_tokens = torch.tensor(
            preproc(dataset.id2neighbor[neighbor_id], tokenizer, stop=False),
            dtype=torch.int64
        ).to(device)[:150]
        tweet_embed = model.gpt.transformer.wte(tweet_tokens)
        prefix_embed = torch.cat([prefix_embed, tweet_embed.unsqueeze(0)], dim=1)
        candidates = generate_beam(model, tokenizer, embed=prefix_embed, beam_size=5)
        generated_text = candidates[0]
    generated_text = postproc(generated_text)
    print('=======================================')
    print(generated_text)
    return generated_text
if __name__ == '__main__':
    # Example invocation: caption a bundled sample clip when run as a script.
    infer('../MusicCaptioning/sample_inputs/sisters.mp3')