# NLP demo software by HyperbeeAI

Copyrights © 2023 Hyperbee.AI Inc. All rights reserved. hello@hyperbee.ai 

### Deployment

This notebook acts as the serial terminal used in the ai85 translation demo. It performs the following steps:

- load the parameter set
- run a test on the PC to determine what to expect from the chip
- run the test on the chip via the serial terminal on the PC

import torch, random
import numpy as np
import torch.nn as nn
from torchtext.legacy.datasets import TranslationDataset
from torchtext.legacy.data import Field, BucketIterator
from utils import tokenize_es, tokenize_en, tokenizer_es, tokenizer_en, TRG_PAD_IDX, \
    translate_sentence, calculate_bleu, license_statement
from models import encoder, decoder, seq2seq
from dataloader import NewsDataset

import serial All rights reserved. hello@hyperbee.ai\n", "\n" ] } ], "source": [ "import torch, random\n", "import numpy as np\n", "import torch.nn as nn\n", "from torchtext.legacy.datasets import TranslationDataset\n", "from torchtext.legacy.data import Field, BucketIterator\n", "from utils import tokenize_es, tokenize_en, tokenizer_es, tokenizer_en, TRG_PAD_IDX, \\\n", " translate_sentence, calculate_bleu, license_statement\n", "from models import encoder, decoder, seq2seq\n", "from dataloader import NewsDataset\n", "\n", "import serial" ] }, { "cell_type": "code", "execution_count": 2, "id": "9966ccad", "metadata": {}, "outputs": [], "source": [ "SEED = 1234\n", "random.seed(SEED)\n", "torch.manual_seed(SEED)\n", "torch.cuda.manual_seed(SEED)\n", "torch.backends.cudnn.deterministic = True\n", "BATCH_SIZE = 48" ] }, { "cell_type": "code", "execution_count": 3, "id": "6d864c26", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Working with device: cuda\n" ] } ], "source": [ "SRC = Field(tokenize = tokenize_es, \n", " init_token = tokenizer_es.token_to_id(\"\"), \n", " eos_token = tokenizer_es.token_to_id(\"\"), \n", " pad_token = tokenizer_es.token_to_id(\"\"),\n", " unk_token = tokenizer_es.token_to_id(\"\"),\n", " use_vocab = False,\n", " batch_first = True)\n", "\n", "TRG = Field(tokenize = tokenize_en, \n", " init_token = tokenizer_en.token_to_id(\"\"), \n", " eos_token = tokenizer_en.token_to_id(\"\"), \n", " pad_token = tokenizer_en.token_to_id(\"\"),\n", " unk_token = tokenizer_en.token_to_id(\"\"),\n", " use_vocab = False,\n", " batch_first = True)\n", "\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "#device = 'cpu'\n", "print(\"Working with device:\", device)" ] }, { "cell_type": "code", "execution_count": 4, "id": "7f1f2efb", "metadata": {}, "outputs": [], "source": [ "train_data, valid_data, test_data = NewsDataset.splits(exts=('.es', '.en'), fields=(SRC, TRG))\n", "train_iterator, 
valid_iterator, test_iterator = BucketIterator.splits(\n", " (train_data, valid_data, test_data),\n", " batch_size = BATCH_SIZE,\n", " device = device)" ] }, { "cell_type": "code", "execution_count": 5, "id": "ccd6c1fc", "metadata": {}, "outputs": [], "source": [ "enc = encoder(device)\n", "dec = decoder(device, TRG_PAD_IDX)\n", "model = seq2seq(enc, dec)" ] }, { "cell_type": "code", "execution_count": 6, "id": "6ae348e3", "metadata": {}, "outputs": [], "source": [ "trained_checkpoint = \"assets/es2en_hw_cp6.pt\"\n", "model.load_state_dict(torch.load(trained_checkpoint, map_location=device), strict=False);\n", "model.to(device);" ] }, { "cell_type": "markdown", "id": "ddb1a23b", "metadata": {}, "source": [ "### serial conversion functions" ] }, { "cell_type": "code", "execution_count": 7, "id": "534e72f2", "metadata": {}, "outputs": [], "source": [ "def singlepass64_tensor2serial(seq_length, tensor):\n", " data = tensor.cpu().detach().numpy();\n", " char_array = '';\n", "\n", " i=0;\n", " while i < 64:\n", " for j in range(0,seq_length):\n", " ch3 = data[0,i+3,j].astype('int8')\n", " ch2 = data[0,i+2,j].astype('int8')\n", " ch1 = data[0,i+1,j].astype('int8')\n", " ch0 = data[0,i+0,j].astype('int8')\n", "\n", " # 2s complements\n", " val3 = \"{0:#0{1}x}\".format(int(np.binary_repr(ch3, width=8), 2),4)\n", " val2 = \"{0:#0{1}x}\".format(int(np.binary_repr(ch2, width=8), 2),4)\n", " val1 = \"{0:#0{1}x}\".format(int(np.binary_repr(ch1, width=8), 2),4)\n", " val0 = \"{0:#0{1}x}\".format(int(np.binary_repr(ch0, width=8), 2),4)\n", "\n", " char_array += val3[2:] + val2[2:] + val1[2:] + val0[2:]\n", "\n", " i=i+4\n", " \n", " return char_array\n", "\n", "def twos_comp(val, bits):\n", " if (val & (1 << (bits - 1))) != 0:\n", " val = val - (1 << bits)\n", " return val\n", "\n", "def tensor_fromserial_singlepass64(char_array, seq_length, typetensor):\n", " out_tensor = torch.zeros_like(typetensor)\n", " i=0;\n", " while i < 64:\n", " for j in range(0, seq_length):\n", " 
cursor = (i*seq_length*2 + j*8); # seq_length*2 because we use 2 characters per element due to pyserial \\CR \\LF issue\n", " word = char_array[cursor : cursor+8];\n", " \n", " # 2s complements\n", " val3 = twos_comp(int(word[0:2],16), 8)\n", " val2 = twos_comp(int(word[2:4],16), 8)\n", " val1 = twos_comp(int(word[4:6],16), 8)\n", " val0 = twos_comp(int(word[6:8],16), 8)\n", " \n", " out_tensor[0,i+3,j] = val3;\n", " out_tensor[0,i+2,j] = val2;\n", " out_tensor[0,i+1,j] = val1;\n", " out_tensor[0,i+0,j] = val0;\n", " \n", " i=i+4\n", "\n", " return out_tensor\n", "\n", "def widemode_twos_comp(val, bits):\n", " if (val & (1 << (bits - 1))) != 0:\n", " val = ((val - (1 << bits)) >> 5) + 1\n", " return (val >> 5)\n", "\n", "def tensor_fromserial_widemode64(char_array, seq_length, typetensor):\n", " out_tensor = torch.zeros_like(typetensor)\n", " i=0;\n", " while i < 64:\n", " for j in range(0, seq_length):\n", " cursor = (i*seq_length*8 + j*32); # seq_length*8 now because we use 8 characters per element, same pyserial issue\n", " word = char_array[cursor : cursor+32];\n", " \n", " # 2s complements\n", " val0 = twos_comp(int(word[0:8],16), 32)\n", " val1 = twos_comp(int(word[8:16],16), 32)\n", " val2 = twos_comp(int(word[16:24],16), 32)\n", " val3 = twos_comp(int(word[24:32],16), 32)\n", " \n", " out_tensor[0,i+0,j] = val0;\n", " out_tensor[0,i+1,j] = val1;\n", " out_tensor[0,i+2,j] = val2;\n", " out_tensor[0,i+3,j] = val3;\n", " \n", " i=i+4\n", "\n", " return out_tensor" ] }, { "cell_type": "markdown", "id": "f248bc1d", "metadata": {}, "source": [ "## Test" ] }, { "cell_type": "markdown", "id": "76d11d80", "metadata": {}, "source": [ "### choose id of example" ] }, { "cell_type": "code", "execution_count": 8, "id": "cdbfd418", "metadata": {}, "outputs": [], "source": [ "example_idx = 120" ] }, { "cell_type": "markdown", "id": "26e82b50", "metadata": {}, "source": [ "### on PC" ] }, { "cell_type": "code", "execution_count": 9, "id": "250dcc52", "metadata": {}, 
"outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "trg = but this won ’ t be the last answer , although for the time being it will drive corporate restructuring and the managerial mind .\n", "\n", "predicted trg = but this will not be the latest response , though it will now be the central force of corporate restructuring and managerial thinking .\n", "\n", "src = pero esto no será la última respuesta , aunque por ahora será la fuerza central de la reestructuración corporativa y el pensamiento gerencial .\n", "\n" ] } ], "source": [ "model.to(device)\n", "src = vars(test_data.examples[example_idx])['src']\n", "trg = tokenizer_en.decode(vars(test_data.examples[example_idx])['trg'], skip_special_tokens=False)\n", "print(f'trg = {trg}')\n", "print(\"\")\n", "translation = translate_sentence(src, SRC, TRG, model, device)\n", "print(f'predicted trg = {translation}')\n", "print(\"\")\n", "src = tokenizer_es.decode(src, skip_special_tokens=False)\n", "print(f'src = {src}')\n", "print(\"\")" ] }, { "cell_type": "markdown", "id": "10e43fe8", "metadata": {}, "source": [ "### on chip" ] }, { "cell_type": "code", "execution_count": 10, "id": "b7aa9adc", "metadata": {}, "outputs": [], "source": [ "enc_pre = model.encoder.pre.to(device)\n", "dec_pre = model.decoder.pre.to(device)\n", "dec_i2w = model.decoder.fff.to(device)\n", "\n", "src = vars(test_data.examples[example_idx])['src']\n", "trg = tokenizer_en.decode(vars(test_data.examples[example_idx])['trg'], skip_special_tokens=False)" ] }, { "cell_type": "markdown", "id": "738e668a", "metadata": {}, "source": [ "**MARK**\n", "\n", "The below cell starts running a serial terminal on this notebook. First run this cell, and when it says \"waiting for ai85\", load the \"assets/demo.elf\" program onto the ai85 chip, and start running it (type c in gdb). This should trigger the terminal here, and operation should resume normally.

The cell is designed to translate a single sentence.

All rights reserved. hello@hyperbee.ai