Builds a database of vector embeddings from a list of abstracts.

## Some Setup

In [None]:
# Install the notebook's dependencies; only transformers is pinned to an exact version.
# NOTE(review): `pip install -U sentence-transformers` runs after the pin and may
# upgrade transformers as one of its dependencies — confirm the installed version.
!pip install transformers==4.28.0
!pip install -U sentence-transformers
!pip install datasets
!pip install langchain
!pip install torch
!pip install faiss-cpu

In [None]:
import os
import shutil

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch

In [None]:
import locale
# Force every caller to see "UTF-8" as the preferred encoding, whatever the
# environment's locale reports.
# The replacement keeps the optional `do_setlocale` argument of the stdlib
# function so callers that pass it (e.g. getpreferredencoding(False)) still work.
# NOTE(review): presumably a workaround for hosted notebooks whose shell
# commands require a UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification

# Load the tokenizer and the BERT sequence-classification model from the
# "biodatlab/MIReAD-Neuro-Large" checkpoint.
m_tokenizer = AutoTokenizer.from_pretrained("biodatlab/MIReAD-Neuro-Large")
m_model = BertForSequenceClassification.from_pretrained("biodatlab/MIReAD-Neuro-Large")
# (tokenizer, model) pair, in the order the create_*_embed functions read it
# (bundle[0] is the tokenizer, bundle[1] is the model).
miread_bundle = (m_tokenizer,m_model)

In [None]:
def create_lbert_embed(sents,bundle):
  """Embed text using the model's pooled output.

  Args:
    sents: text passed unchanged to the tokenizer (a string or a list of
      strings).
    bundle: (tokenizer, model) pair; bundle[0] tokenizes, bundle[1] is called
      on the resulting tokens.

  Returns:
    The `pooler_output` of the model call, moved to the CPU.

  NOTE(review): the model's output must expose `pooler_output`; confirm this
  holds for the model in the bundle you pass (sequence-classification heads
  may not provide it).
  """
  tokenizer, model = bundle[0], bundle[1]
  # Prefer the GPU, but fall back to the CPU so the function also runs on
  # machines without CUDA instead of raising.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)
  tokens = tokenizer(sents,padding=True,truncation=True,return_tensors='pt')
  tokens = tokens.to(device)
  # Inference only: no gradients are needed for embeddings.
  with torch.no_grad():
    embeds = model(**tokens, output_hidden_states=True,return_dict=True).pooler_output
  return embeds.cpu()

def create_miread_embed(sents,bundle):
  """Embed text with the first-token hidden state of the model's BERT encoder.

  Args:
    sents: text passed unchanged to the tokenizer (a string or a list of
      strings); inputs are truncated to at most 512 tokens.
    bundle: (tokenizer, model) pair; bundle[0] tokenizes, and the `bert`
      sub-module of bundle[1] is run on the resulting tokens.

  Returns:
    `last_hidden_state[:, 0, :]` of the encoder output, moved to the CPU.
  """
  tokenizer, model = bundle[0], bundle[1]
  # Prefer the GPU, but fall back to the CPU so the function also runs on
  # machines without CUDA instead of raising.
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  model.to(device)
  tokens = tokenizer(sents,
                   max_length=512,
                   padding=True,
                   truncation=True,
                   return_tensors="pt"
                  )
  tokens = tokens.to(device)
  # Inference only: no gradients are needed for embeddings.
  with torch.no_grad():
    # Call the encoder directly, bypassing whatever head the model adds on top.
    out = model.bert(**tokens)
    # Keep only the hidden state at sequence position 0 for each input.
    feature = out.last_hidden_state[:, 0, :]
  return feature.cpu()

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Embedding object handed to FAISS.from_embeddings in add_to_db. The vectors
# stored there are precomputed by create_embed, not produced by this object.
# NOTE(review): HuggingFaceEmbeddings presumably builds its own pooling for a
# checkpoint that is not a sentence-transformers model — confirm that query
# vectors from this embedder match the first-token vectors stored by
# create_miread_embed.
model_name = "biodatlab/MIReAD-Neuro-Large"
# Hard-codes the GPU; this cell needs a CUDA device as written.
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': False}
faiss_embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

def add_to_db(data,create_embed,bundle,name='',batch_size=128):
  """Embed every abstract in `data` and collect the results in a FAISS store.

  data : list of dict rows; each needs a 'title' and an 'abstract' field, the
         other metadata fields ('journal', 'authors', 'link', 'date',
         'submitter') default to the string 'None' when absent
  create_embed : function called as create_embed(abstract, bundle); the first
                 row of its `.tolist()` result is stored as the vector
  bundle : passed through unchanged to create_embed
  name : prefix of the generated ids, which have the form '<name>-<row index>'
  batch_size : number of rows embedded and added to the store per step

  Returns the merged FAISS vector store, or None when `data` is empty.
  """
  vecdb = None
  for start in tqdm(range(0, len(data), batch_size)):
    # slicing clamps at the end of the list, so the last batch may be shorter
    batch = data[start:start+batch_size]
    # ids are built from the row's position in the full list
    ids = [name + '-' + str(start + offset) for offset in range(len(batch))]
    metadatas = [{
                  'journal':row.get('journal','None'),
                  'title':row['title'],
                  'abstract': row['abstract'],
                  'authors':row.get('authors','None'),
                  'link':row.get('link','None'),
                  'date':row.get('date','None'),
                  'submitter':row.get('submitter','None'),
                  } for row in batch]
    # abstracts are embedded one at a time; [0] takes the single resulting row
    em = [create_embed(row['abstract'],bundle).tolist()[0] for row in batch]
    texts = [row['abstract'] for row in batch]
    records = list(zip(texts, em))
    batch_db = FAISS.from_embeddings(records,faiss_embedder,metadatas=metadatas,ids=ids)
    if vecdb is None:
      # the first batch becomes the store every later batch is merged into
      vecdb = batch_db
    else:
      vecdb.merge_from(batch_db)
  return vecdb

In [None]:
# Load the inputs from the working directory: the paper data (load_nbdt reads
# its 'authorId' and 'papers' columns) and the author alias table.
nbdt_data = pd.read_json('data_final.json')
aliases = pd.read_csv('id_list.csv')

In [None]:
# Drop rows that repeat a 'Full Name' so load_nbdt visits each name once.
aliases = aliases.drop_duplicates('Full Name')
# Preview the de-duplicated alias table.
aliases.head()

In [None]:
# Preview the first rows of the paper data.
nbdt_data.head()

In [None]:
def load_nbdt(data,aliases):
  """Flatten the per-author paper lists into one de-duplicated record list.

  Args:
    data: DataFrame with an 'authorId' column and a 'papers' column whose
      cells are lists of paper dicts ('url', 'title', 'abstract', 'year' and
      optionally 'journal').
    aliases: DataFrame read positionally: the first column is the person's
      name, the second a string holding a Python list literal of author ids.

  Returns:
    (records, (no_abst_count, no_journal_count)). `records` holds one dict
    per distinct paper URL, in first-seen order. The two counters are
    incremented for every paper visited, duplicates included.
  """
  import ast  # local import: only needed to parse the id lists

  nbdt_records = []
  # set membership keeps the duplicate check O(1) per paper
  seen_urls = set()
  no_abst_count = 0
  no_journal_count = 0
  for row in aliases.itertuples():
    # row[0] is the DataFrame index; the data columns start at row[1]
    name = row[1]
    # Parsed with ast.literal_eval rather than eval: the cell comes from a CSV
    # file and must not be able to execute code.
    auth_ids = [int(x) for x in ast.literal_eval(row[2])]
    # use the `data` argument, not a module-level DataFrame
    papers = data.loc[data['authorId'].isin(auth_ids)]['papers']
    for paper_set in papers:
      for paper in paper_set:
        url = paper['url']
        title = paper['title']
        abst = paper['abstract']
        year = paper['year']
        # a missing journal and a journal entry without a name are both
        # stored as the 'None' sentinel and counted
        journal = (paper.get('journal') or {}).get('name')
        if not journal:
          journal = 'None'
          no_journal_count += 1
        if not abst:
          abst = ''
          no_abst_count += 1
        if url in seen_urls:
          # NOTE(review): only the first matching person is recorded as
          # author of a shared paper — confirm this is intended.
          continue
        seen_urls.add(url)
        nbdt_records.append({'journal':journal,'title':title,'abstract':abst,'link':url,'date':year,'authors':[name],'submitter':'None'})
  return nbdt_records, (no_abst_count,no_journal_count)
# Build the record list; no_counts is (no_abst_count, no_journal_count).
nbdt_recs, no_counts = load_nbdt(nbdt_data,aliases)

In [None]:
# Embed every record with the MIReAD model; ids are prefixed with 'nbdt'.
# add_to_db returns None when nbdt_recs is empty, in which case the
# save_local call below fails.
nbdt_db = add_to_db(nbdt_recs,create_miread_embed,miread_bundle,'nbdt')
# Write the index to the "nbdt_index" path.
nbdt_db.save_local("nbdt_index")