---
license: apache-2.0
tags:
- MDEL
---

# Model Name

Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts

# Model Description

This model was generated by averaging the weights of the following models:

- [Multi-Domain-Expert-Layers/expert-pubmed_central](https://huggingface.co/Multi-Domain-Expert-Layers/expert-pubmed_central)
- [Multi-Domain-Expert-Layers/expert-freelaw](https://huggingface.co/Multi-Domain-Expert-Layers/expert-freelaw)
- [Multi-Domain-Expert-Layers/expert-github](https://huggingface.co/Multi-Domain-Expert-Layers/expert-github)
- [Multi-Domain-Expert-Layers/expert-uspto](https://huggingface.co/Multi-Domain-Expert-Layers/expert-uspto)
- [Multi-Domain-Expert-Layers/expert-arxiv](https://huggingface.co/Multi-Domain-Expert-Layers/expert-arxiv)
- [theblackcat102/pythia-1b-deduped-sft](https://huggingface.co/theblackcat102/pythia-1b-deduped-sft)

We also keep a mixture that is primarily one of the above models as an expert that can be loaded on demand.

### NOTE: There is a mistake below: we route with a PubMed Abstracts expert, but we merged PubMed Central

#### [Try the demo on Colab](https://colab.research.google.com/drive/1GgB8H30L5r0N--gexdEweK5f1yJfxMd_?usp=sharing)

```
import os

try:
    import transformers, fasttext, huggingface_hub
except ImportError:
    os.system("pip install transformers huggingface_hub fasttext")

from transformers import AutoTokenizer, AutoModelForCausalLM
import fasttext
from huggingface_hub import hf_hub_download
import torch
from torch import nn
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXLayer


class GPTNeoXExpertsForCasualLM(GPTNeoXForCausalLM):
    """Stores various experts for layers 9 and 10."""  # , 11

    __expert_classifier = None

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.orig_chat = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.uspto_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.github_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.pubmed_abstracts_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.freelaw_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.arxiv_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.merged_chat_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.curr_expert = "MergedChat"
        if GPTNeoXExpertsForCasualLM.__expert_classifier is None:
            file_name = hf_hub_download(
                repo_id="Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts",
                filename="expert_classify.ftz",
            )
            GPTNeoXExpertsForCasualLM.__expert_classifier = fasttext.load_model(file_name)
            print(file_name)

    def predict_expert(self, text):
        """Classify `text` into one of the Pile domains:
        ['__label__StackExchange', '__label__PubMed-Abstracts', '__label__Github',
         '__label__USPTO-Backgrounds', '__label__Pile-CC', '__label__PubMed-Central',
         '__label__OpenWebText2', '__label__FreeLaw', '__label__Wikipedia-(en)',
         '__label__ArXiv', '__label__DM-Mathematics', '__label__NIH-ExPorter',
         '__label__HackerNews', '__label__Enron-Emails', '__label__OpenSubtitles',
         '__label__YoutubeSubtitles', '__label__Books3', '__label__EuroParl',
         '__label__Gutenberg-(PG-19)', '__label__PhilPapers', '__label__BookCorpus2',
         '__label__Ubuntu-IRC']
        """
        text = text.replace(": ", " ").strip().replace("\n", " ").replace("\r", " ").replace("  ", " ")
        answer = GPTNeoXExpertsForCasualLM.__expert_classifier.predict(text)
        label = answer[0][0].replace("__label__", "")
        score = answer[1][0]
        return (label, score)
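    # Routing note: generate_with_expert() below calls predict_expert() on the
    # first prompt in the batch. When the classifier is confident (score > 0.8)
    # and the label is one of the five routed domains (PubMed, FreeLaw, ArXiv,
    # Github, USPTO), layers 9-10 are swapped for that domain's expert;
    # anything else falls back to the merged chat expert. For example,
    # predict_expert("A method and apparatus for ...") might plausibly return
    # ("USPTO-Backgrounds", 0.9); the exact score depends on the classifier.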

    def generate_with_expert(
        self,
        text,
        tokenizer,
        expert="",
        return_answer_only=False,
        do_self_contrastive=True,
        max_length=128,
        min_length=1,
        max_return_sequences=1,
        do_sample=True,
        do_beam=False,
        device="cuda",
        target_lang=None,
    ):
        """Generate using one of the experts; routes automatically if `expert` is empty."""
        tokenizer.pad_token = tokenizer.eos_token
        if type(text) is str:
            text = [text]
        # Hack: assume a single expert for the whole batch.
        if not expert:
            label, score = self.predict_expert(text[0])
            if score > 0.8 and ("PubMed" in label or "FreeLaw" in label or "ArXiv" in label
                                or "Github" in label or "USPTO" in label):
                expert = label
            else:
                expert = "MergedChat"
        if expert != self.curr_expert:
            print("Switching to expert", expert)
            self.curr_expert = expert
            for layer_id in range(2):
                if expert == "OrigChat":
                    self.gpt_neox.layers[layer_id + 9] = self.orig_chat[layer_id]
                elif "USPTO" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.uspto_expert[layer_id]
                elif "Github" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.github_expert[layer_id]
                elif "PubMed" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.pubmed_abstracts_expert[layer_id]
                elif "ArXiv" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.arxiv_expert[layer_id]
                elif "FreeLaw" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.freelaw_expert[layer_id]
                else:
                    self.gpt_neox.layers[layer_id + 9] = self.merged_chat_expert[layer_id]
        text = [p.strip() for p in text]
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = self.generate(
                **inputs,
                max_length=max_length,
                repetition_penalty=1.1,
                min_length=min_length,
                do_sample=do_sample,
                top_p=0.95,
                penalty_alpha=0.6 if do_self_contrastive else None,
                top_k=10,
            )
        ret = []
        for i in range(len(outputs)):
            # Could use batch_decode, unless we want to do something special per sample.
            out = tokenizer.decode(outputs[i], skip_special_tokens=True)
            if return_answer_only:
                out = out[len(text[i]):].lstrip(".? \n\t")
            ret.append(out)
        return ret


tokenizer = AutoTokenizer.from_pretrained("theblackcat102/pythia-1b-deduped-sft")
tokenizer.pad_token = tokenizer.eos_token
model = GPTNeoXExpertsForCasualLM.from_pretrained("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts").half().cuda().eval()

print('##')
print(model.generate_with_expert("source code for sorting a list :", tokenizer)[0])
print('##')
print(model.generate_with_expert("When was Abraham Lincoln born? :", tokenizer)[0])
print('##')
print(model.generate_with_expert("Medical journal article about ovarian cancer :", tokenizer)[0])
```
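The `expert` argument can also be set explicitly to bypass the fastText router: any string containing one of the routed domain names selects that expert, and `"OrigChat"` restores the original chat layers. A minimal sketch (the prompts are illustrative; output not shown):

```
# Pin the USPTO expert regardless of what the classifier would predict.
print(model.generate_with_expert("Background of the invention :", tokenizer, expert="USPTO")[0])

# Switch back to the original (un-merged) chat layers.
print(model.generate_with_expert("Tell me a story :", tokenizer, expert="OrigChat")[0])
```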
## Produces this output:

```
Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.
/root/.cache/huggingface/hub/models--Multi-Domain-Expert-Layers--MDEL-theblackcat-chat-5-experts/snapshots/a8a5b15d85eb0342825063fa1d7b83465f9eefa6/expert_classify.ftz
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
##
Switching to expert Github
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
source code for sorting a list : sort( sort( sorted( sorted( sorted( sorted( sorted( sorted( [ x, y * x - 1 + y / y ])) ) ) )) ) ). unique(). sort( key ='sorted' ). sort_key( sort_key = 0, reverse = True ) # [ 5.1.2, 5.1.3, 3, 2, 1] asd, 6, 7 # { 0.4 } asd # [ 5.2.7, 4.5.4, 6
##
Switching to expert MergedChat
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
When was Abraham Lincoln born? : This is an interesting topic to be asked about and I will try to give a bit of information, but it is very important that you should read the book by Robert O'Brien (1834-1858). [Abraham Lincoln and His Family](_URL_0_) which I highly recommend. This is a biography of a man who lived during the American civil war and had been a US President for 8 years in 1860. There are quite a lot of books available on Lincoln's life and his family. It would help me if you were more specific as I only knew him
##
Switching to expert PubMed-Abstracts
Medical journal article about ovarian cancer : "Cancer of the Ova" by Susan K. Hines, PhD - On April 10th, 2007, researchers from the National Cancer Institute discovered a mechanism called [Cullin Kin Reductions](_URL_0_) that can reduce and prevent this deadly disease in humans. This is called an "Cullin-R family". These proteins are involved in protein synthesis in the cell that is responsible for cellular maintenance and repair, and these two groups (Kin reduction and Mutagenesis) work on this process all the way to DNA replication during DNA synthesis. In a
```
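Every expert above is a linear interpolation of GPT-NeoX transformer layers 9 and 10: each routed domain expert blends its domain model with the chat model at 0.6/0.4, and the merged chat expert blends chat, a pubmed/freelaw/github/arxiv merge, and USPTO at 0.6/0.3/0.1. A small helper capturing this rule (a hypothetical refactor; the released script below writes the loops out inline):

```
def interpolate_layer_(dst_layer, src_layer, alpha):
    """In place: dst = alpha * dst + (1 - alpha) * src, parameter by parameter."""
    with torch.no_grad():
        for p_dst, p_src in zip(dst_layer.parameters(), src_layer.parameters()):
            p_dst.data = p_dst.data * alpha + p_src.data * (1.0 - alpha)
```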
### To recreate the expert, modify this script

We can also extend this to do dynamic merging and/or experiment with different weights for different layers (see the sketch after the script).

```
def recreate_merged_expert():
    model1 = GPTNeoXExpertsForCasualLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()
    model2 = AutoModelForCausalLM.from_pretrained("stillerman/MDEL-pubmed-feelaw-github-arxiv").float()
    model_uspto = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-uspto").float()
    model_github = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-github").float()
    model_pubmed_abstracts = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-pubmed_abstracts").float()
    model_freelaw = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-freelaw").float()
    model_arxiv = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-arxiv").float()
    model = AutoModelForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()  # half().cuda().eval()

    with torch.no_grad():
        # Keep a copy of the original chat layers so "OrigChat" can be restored.
        for layer_id in [9, 10]:  # 9,10,11,12,13
            model1.orig_chat[layer_id - 9] = model.gpt_neox.layers[layer_id]

        # Merged chat expert: 0.6 chat + 0.3 pubmed/freelaw/github/arxiv merge + 0.1 USPTO.
        for layer_id in [9, 10]:
            for p1, p2, p3 in zip(model1.gpt_neox.layers[layer_id].parameters(),
                                  model2.gpt_neox.layers[layer_id].parameters(),
                                  model_uspto.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.3 + p3.data * 0.1
            model1.merged_chat_expert[layer_id - 9] = model1.gpt_neox.layers[layer_id]

        # Each routed expert: 0.6 domain expert + 0.4 chat model.
        for layer_id in [9, 10]:
            for p1, p2 in zip(model_uspto.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.uspto_expert[layer_id - 9] = model_uspto.gpt_neox.layers[layer_id]

        for layer_id in [9, 10]:
            for p1, p2 in zip(model_github.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.github_expert[layer_id - 9] = model_github.gpt_neox.layers[layer_id]

        for layer_id in [9, 10]:
            for p1, p2 in zip(model_pubmed_abstracts.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.pubmed_abstracts_expert[layer_id - 9] = model_pubmed_abstracts.gpt_neox.layers[layer_id]

        for layer_id in [9, 10]:
            for p1, p2 in zip(model_freelaw.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.freelaw_expert[layer_id - 9] = model_freelaw.gpt_neox.layers[layer_id]

        for layer_id in [9, 10]:
            for p1, p2 in zip(model_arxiv.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.arxiv_expert[layer_id - 9] = model_arxiv.gpt_neox.layers[layer_id]

    model1 = model1.half().eval()
    model1.save_pretrained("MDEL-theblackcat-chat-5-experts", torch_dtype=torch.float16)
    model1.push_to_hub("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
    return model1
```
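For example, inside `recreate_merged_expert()` the fixed 0.6/0.4 blend for a routed expert could be replaced with a per-layer schedule. A minimal sketch for the arXiv expert (the `alphas` values are made up for illustration):

```
# Hypothetical per-layer weights: trust the domain expert more in the deeper layer.
alphas = {9: 0.5, 10: 0.7}

with torch.no_grad():
    for layer_id, alpha in alphas.items():
        for p_exp, p_base in zip(model_arxiv.gpt_neox.layers[layer_id].parameters(),
                                 model.gpt_neox.layers[layer_id].parameters()):
            p_exp.data = p_exp.data * alpha + p_base.data * (1.0 - alpha)
        model1.arxiv_expert[layer_id - 9] = model_arxiv.gpt_neox.layers[layer_id]
```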