riverbed / README.md
Ontocord.AI
Update README.md
83dbc50
|
raw
history blame
12.3 kB
metadata
license: apache-2.0
tags:
  - MDEL

Model Name

Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts

Model Description

This model was generated by averaging the weights of the following models:

NOTE: there is a mistake below — the router selects a "PubMed-Abstracts" expert, but the weights that were actually merged came from PubMed Central.

import os
try:
  import transformers, fasttext, huggingface_hub
except ImportError:
  # Best-effort bootstrap for fresh environments (e.g. Colab); catch only
  # ImportError so real errors inside the packages are not silently hidden.
  os.system("pip install transformers huggingface_hub fasttext")
  
from transformers import AutoTokenizer, AutoModelForCausalLM
import fasttext
from huggingface_hub import hf_hub_download
import torch
from torch import nn
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXLayer

class GPTNeoXExpertsForCasualLM(GPTNeoXForCausalLM):
  """GPT-NeoX causal LM with swappable per-domain "expert" copies of
  transformer layers 9 and 10.

  A fastText domain classifier (downloaded once and shared class-wide)
  predicts the domain of the prompt; the matching expert layers are then
  swapped into the backbone before generation.
  """
  # Class-wide fastText domain classifier, loaded lazily on first __init__.
  __expert_classifier = None

  def __init__(self, config):
    super().__init__(config)
    self.config = config
    # Each expert is a 2-element ModuleList standing in for backbone
    # layers 9 and 10 (see generate_with_expert, which does the swap).
    self.orig_chat = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
    self.uspto_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
    self.github_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
    self.pubmed_abstracts_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
    self.freelaw_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
    self.arxiv_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
    self.merged_chat_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
    self.curr_expert = "MergedChat"
    # Lazily load the shared classifier once per process. (The former
    # `global __expert_classifier` statement was dead code: the attribute
    # lives on the class, not at module level, and is accessed through
    # the class below.)
    if GPTNeoXExpertsForCasualLM.__expert_classifier is None:
      file_name = hf_hub_download(repo_id="Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts", filename="expert_classify.ftz")
      GPTNeoXExpertsForCasualLM.__expert_classifier = fasttext.load_model(file_name)
      print (file_name)

  def predict_expert(self, text):
    """Classify `text` into a Pile-style domain label.

    Returns a `(label, score)` tuple, where `label` is one of the
    classifier's labels with the `__label__` prefix stripped:

    ['__label__StackExchange',
      '__label__PubMed-Abstracts',
      '__label__Github',
      '__label__USPTO-Backgrounds',
      '__label__Pile-CC',
      '__label__PubMed-Central',
      '__label__OpenWebText2',
      '__label__FreeLaw',
      '__label__Wikipedia-(en)',
      '__label__ArXiv',
      '__label__DM-Mathematics',
      '__label__NIH-ExPorter',
      '__label__HackerNews',
      '__label__Enron-Emails',
      '__label__OpenSubtitles',
      '__label__YoutubeSubtitles',
      '__label__Books3',
      '__label__EuroParl',
      '__label__Gutenberg-(PG-19)',
      '__label__PhilPapers',
      '__label__BookCorpus2',
      '__label__Ubuntu-IRC']
      """
    # Strip chat-turn markers and newlines: fastText expects a single line.
    text = text.replace("<human>: ", " ").replace("<bot>: ", " ").strip().replace("\n", " ").replace("\r", " ").replace("  ", " ")
    answer = GPTNeoXExpertsForCasualLM.__expert_classifier.predict(text)
    label = answer[0][0].replace("__label__", "")
    score = answer[1][0]
    return (label, score)

  def generate_with_expert(self, text, tokenizer, expert="", return_answer_only=False, do_self_contrastive=True, max_length=128, min_length=1, max_return_sequences=1, do_sample=True, do_beam=False, device="cuda", target_lang=None):
    """Generate text after routing to (and swapping in) the right expert.

    Args:
      text: a prompt string or list of prompts (one expert per batch).
      tokenizer: tokenizer matching the backbone model.
      expert: force a specific expert; empty string means auto-route via
        the classifier on the first prompt of the batch.
      return_answer_only: strip the prompt prefix from each output.
      do_self_contrastive: pass penalty_alpha=0.6 to `generate`.
      max_length / min_length: generation length bounds.
      do_sample: whether to sample (was previously ignored — the call
        hardcoded `do_sample=True`).
      max_return_sequences, do_beam, target_lang: currently unused; kept
        for interface compatibility.
      device: device the input ids are moved to.

    Returns a list of decoded strings, one per prompt.
    """
    tokenizer.pad_token = tokenizer.eos_token

    if type(text) is str:
      text = [text]
    # hack - let's assume a single expert per batch
    if not expert:
      label, score = self.predict_expert(text[0])
      # Only trust high-confidence predictions for domains we have an
      # expert for; everything else falls back to the merged chat expert.
      if score > 0.8:
        if "PubMed" in label or  "FreeLaw" in label or "ArXiv" in label or "Github" in label or "USPTO" in label:
          expert = label
        else:
          expert = "MergedChat"
      else:
        expert = "MergedChat"
    if expert != self.curr_expert:
      print ("Switching to expert", expert)
      self.curr_expert = expert
      # Swap the selected expert's copies into backbone layers 9 and 10.
      for layer_id in range(2):
        if expert == "OrigChat":
          self.gpt_neox.layers[layer_id+9] = self.orig_chat[layer_id]
        elif "USPTO" in expert:
          self.gpt_neox.layers[layer_id+9] = self.uspto_expert[layer_id]
        elif "Github" in expert:
          self.gpt_neox.layers[layer_id+9] = self.github_expert[layer_id]
        elif "PubMed" in expert:
          self.gpt_neox.layers[layer_id+9] = self.pubmed_abstracts_expert[layer_id]
        elif "ArXiv" in expert:
          self.gpt_neox.layers[layer_id+9] = self.arxiv_expert[layer_id]
        elif "FreeLaw" in expert:
          self.gpt_neox.layers[layer_id+9] = self.freelaw_expert[layer_id]
        else:
          self.gpt_neox.layers[layer_id+9] = self.merged_chat_expert[layer_id]
    text = [p.strip() for p in text]
    input_ids = tokenizer(text, return_tensors='pt',padding=True, truncation=True, max_length=max_length )
    input_ids = input_ids.to(device)
    with torch.no_grad():
      outputs = self.generate(
                **input_ids,
                max_length=max_length,
                repetition_penalty=1.1,
                min_length=min_length,
                do_sample=do_sample,  # fixed: honor the caller's flag instead of a hardcoded True
                top_p=0.95,
                penalty_alpha=0.6 if do_self_contrastive else None,
                top_k=10,
                )
      ret = []
      for i in range(len(outputs)): # can use batch_decode, unless we want to do something special here
        out = tokenizer.decode(outputs[i], skip_special_tokens=True)
        if return_answer_only:
          # Drop the echoed prompt plus leading punctuation/whitespace.
          out = out[len(text[i]):].lstrip(".? \n\t")
        ret.append(out)

    return ret

# Load the tokenizer of the base SFT model the experts were built from.
tokenizer = AutoTokenizer.from_pretrained("theblackcat102/pythia-1b-deduped-sft")

# GPT-NeoX has no pad token by default; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token 

# Load the merged 5-expert checkpoint in fp16 on GPU, inference mode.
model = GPTNeoXExpertsForCasualLM.from_pretrained("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts").half().cuda().eval()
 
# Three demo prompts; per the sample output below they route to the
# Github, MergedChat and PubMed-Abstracts experts respectively.
print ('##')
print (model.generate_with_expert("source code for sorting a list <bot>:", tokenizer) [0])
print ('##')
print (model.generate_with_expert("When was Abraham Lincoln born? <bot>:", tokenizer) [0])
print ('##')
print (model.generate_with_expert("Medical journal article about ovarian cancer <bot>:", tokenizer) [0])     

Produces this output:

Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
/root/.cache/huggingface/hub/models--Multi-Domain-Expert-Layers--MDEL-theblackcat-chat-5-experts/snapshots/a8a5b15d85eb0342825063fa1d7b83465f9eefa6/expert_classify.ftz
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
##
Switching to expert Github
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
source code for sorting a list :  sort( sort( sorted( sorted( sorted( sorted( sorted( sorted( [ x, y      * x - 1 + y  / y ])) )  )  )) ) ). unique(). sort( key ='sorted' ). sort_key( sort_key = 0, reverse = True ) # [ 5.1.2, 5.1.3, 3, 2, 1] asd, 6, 7 # { 0.4 } asd # [ 5.2.7, 4.5.4, 6
##
Switching to expert MergedChat
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
When was Abraham Lincoln born? :  This is an interesting topic to be asked about and I will try to give a bit of information, but it is very important that you should read the book by Robert O'Brien (1834-1858). [Abraham Lincoln and His Family](_URL_0_) which I highly recommend. This is a biography of a man who lived during the American civil war and had been a US President for 8 years in 1860. There are quite a lot of books available on Lincoln's life and his family. It would help me if you were more specific as I only knew him
##
Switching to expert PubMed-Abstracts
Medical journal article about ovarian cancer :
 "Cancer of the Ova" by Susan K. Hines, PhD - On April 10th, 2007, researchers from the National Cancer Institute discovered a mechanism called [Cullin Kin Reductions](_URL_0_) that can reduce and prevent this deadly disease in humans. This is called an "Cullin-R family". These proteins are involved in protein synthesis in the cell that is responsible for cellular maintenance and repair, and these two groups (Kin reduction and Mutagenesis) work on this process all the way to DNA replication during DNA synthesis. In a

To recreate the experts, modify the script below. It can also be extended to do dynamic merging and/or to experiment with different weights for different layers.


def _blend_into_expert(expert_slot, donor, base, donor_weight=0.6, layer_ids=(9, 10)):
  """Blend `donor`'s layers with `base`'s (donor_weight vs. 1-donor_weight),
  mutating the donor's parameters in place, and store the blended layer
  objects in `expert_slot` (a 2-element ModuleList covering layers 9-10)."""
  base_weight = 1.0 - donor_weight
  first = layer_ids[0]
  for layer_id in layer_ids:
    donor_layer = donor.gpt_neox.layers[layer_id]
    for p_d, p_b in zip(donor_layer.parameters(), base.gpt_neox.layers[layer_id].parameters()):
      p_d.data = p_d.data * donor_weight + p_b.data * base_weight
    expert_slot[layer_id - first] = donor_layer


def recreate_merged_expert():
  """Rebuild the multi-expert checkpoint from the individual expert models.

  Downloads the base SFT model and the per-domain expert models, blends
  each expert's layers 9-10 with the base model (0.6 expert / 0.4 base),
  stores them in the matching expert slots of a GPTNeoXExpertsForCasualLM,
  then saves and pushes the fp16 result to the Hub. Returns the model.

  NOTE(review): per the README note above, the "pubmed_abstracts" slot is
  filled from the expert-pubmed_abstracts checkpoint while the original
  merged weights came from PubMed Central — kept as-is for reproducibility.
  """
  model1 = GPTNeoXExpertsForCasualLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()

  model2 = AutoModelForCausalLM.from_pretrained("stillerman/MDEL-pubmed-feelaw-github-arxiv").float()

  model_uspto = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-uspto").float()

  model_github = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-github").float()
  model_pubmed_abstracts = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-pubmed_abstracts").float()
  model_freelaw = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-freelaw").float()
  model_arxiv = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-arxiv").float()

  # Pristine base model: source of the OrigChat layers and of the 0.4
  # base share in every blend below. Its parameters are never mutated.
  model = AutoModelForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()

  with torch.no_grad():
    # Keep unmodified copies of the base chat layers as the OrigChat expert.
    for layer_id in (9, 10):
      model1.orig_chat[layer_id - 9] = model.gpt_neox.layers[layer_id]

    # Three-way merge for the default chat expert:
    # 0.6 chat + 0.3 pubmed/freelaw/github/arxiv merge + 0.1 uspto.
    # This must run BEFORE the USPTO blend below, which mutates
    # model_uspto's parameters in place.
    for layer_id in (9, 10):
      for p1, p2, p3 in zip(model1.gpt_neox.layers[layer_id].parameters(),
                            model2.gpt_neox.layers[layer_id].parameters(),
                            model_uspto.gpt_neox.layers[layer_id].parameters()):
        p1.data = p1.data * 0.6 + p2.data * 0.3 + p3.data * 0.1
      model1.merged_chat_expert[layer_id - 9] = model1.gpt_neox.layers[layer_id]

    # Per-domain experts: 0.6 expert + 0.4 base, blended in place.
    _blend_into_expert(model1.uspto_expert, model_uspto, model)
    _blend_into_expert(model1.github_expert, model_github, model)
    _blend_into_expert(model1.pubmed_abstracts_expert, model_pubmed_abstracts, model)
    _blend_into_expert(model1.freelaw_expert, model_freelaw, model)
    _blend_into_expert(model1.arxiv_expert, model_arxiv, model)

  model1 = model1.half().eval()
  model1.save_pretrained("MDEL-theblackcat-chat-5-experts", torch_dtype=torch.float16)
  model1.push_to_hub("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
  return model1