---
license: apache-2.0
tags:
- MDEL
---

# Model Name

Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts

# Model Description

This model was generated by averaging the weights of the following models:

- [Multi-Domain-Expert-Layers/expert-pubmed_central](https://huggingface.co/Multi-Domain-Expert-Layers/expert-pubmed_central)
- [Multi-Domain-Expert-Layers/expert-freelaw](https://huggingface.co/Multi-Domain-Expert-Layers/expert-freelaw)
- [Multi-Domain-Expert-Layers/expert-github](https://huggingface.co/Multi-Domain-Expert-Layers/expert-github)
- [Multi-Domain-Expert-Layers/expert-uspto](https://huggingface.co/Multi-Domain-Expert-Layers/expert-uspto)
- [Multi-Domain-Expert-Layers/expert-arxiv](https://huggingface.co/Multi-Domain-Expert-Layers/expert-arxiv)
- [theblackcat102/pythia-1b-deduped-sft](https://huggingface.co/theblackcat102/pythia-1b-deduped-sft)

We also keep, for each domain, a mixture weighted primarily toward that domain's expert; these per-domain mixtures can be loaded on demand at generation time.

### NOTE: There is a mistake below: the router selects a pubmed-abstracts expert, but the weights we merged came from pubmed_central.

[Try the demo on Colab](https://colab.research.google.com/drive/1GgB8H30L5r0N--gexdEweK5f1yJfxMd_?usp=sharing)

```
import os

try:
    import transformers, fasttext, huggingface_hub
except ImportError:
    os.system("pip install transformers huggingface_hub fasttext")

import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM, GPTNeoXLayer
import fasttext
from huggingface_hub import hf_hub_download


class GPTNeoXExpertsForCasualLM(GPTNeoXForCausalLM):
    """Stores various experts for layers 9, 10"""  # , 11

    __expert_classifier = None

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.orig_chat = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.uspto_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.github_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.pubmed_abstracts_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.freelaw_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.arxiv_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.merged_chat_expert = nn.ModuleList([GPTNeoXLayer(config) for _ in range(2)])
        self.curr_expert = "MergedChat"
        if GPTNeoXExpertsForCasualLM.__expert_classifier is None:
            # Download the fastText domain classifier used to route prompts to experts.
            file_name = hf_hub_download(
                repo_id="Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts",
                filename="expert_classify.ftz")
            GPTNeoXExpertsForCasualLM.__expert_classifier = fasttext.load_model(file_name)
            print(file_name)

    def predict_expert(self, text):
        """Classifies text into one of the Pile domains:
        ['__label__StackExchange', '__label__PubMed-Abstracts', '__label__Github',
         '__label__USPTO-Backgrounds', '__label__Pile-CC', '__label__PubMed-Central',
         '__label__OpenWebText2', '__label__FreeLaw', '__label__Wikipedia-(en)',
         '__label__ArXiv', '__label__DM-Mathematics', '__label__NIH-ExPorter',
         '__label__HackerNews', '__label__Enron-Emails', '__label__OpenSubtitles',
         '__label__YoutubeSubtitles', '__label__Books3', '__label__EuroParl',
         '__label__Gutenberg-(PG-19)', '__label__PhilPapers', '__label__BookCorpus2',
         '__label__Ubuntu-IRC']
        """
        # Normalize whitespace before passing the text to the fastText classifier.
        text = text.replace(": ", " ").strip().replace("\n", " ").replace("\r", " ").replace("  ", " ")
        answer = GPTNeoXExpertsForCasualLM.__expert_classifier.predict(text)
        label = answer[0][0].replace("__label__", "")
        score = answer[1][0]
        return (label, score)
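
    # Illustrative routing check (hypothetical score value): for a biomedical
    # prompt, the classifier should pick the PubMed domain, e.g.
    #   label, score = model.predict_expert("BACKGROUND: Ovarian cancer incidence ...")
    #   # -> ("PubMed-Abstracts", 0.9); label names match the Pile subsets listed above.
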
    def generate_with_expert(self, text, tokenizer, expert="", return_answer_only=False,
                             do_self_contrastive=True, max_length=128, min_length=1,
                             max_return_sequences=1, do_sample=True, do_beam=False,
                             device="cuda", target_lang=None):
        """Generates using one of the experts, routing on the first prompt in the batch."""
        tokenizer.pad_token = tokenizer.eos_token
        if type(text) is str:
            text = [text]
        # Hack: assume a single expert per batch.
        if not expert:
            label, score = self.predict_expert(text[0])
            if "PubMed" in label or "FreeLaw" in label or "ArXiv" in label or "Github" in label or "USPTO" in label:
                if score > 0.8:
                    expert = label
                elif score > 0.6:
                    expert = "MergedChat"
                else:
                    expert = "OrigChat"
            else:
                expert = "OrigChat"
        if expert != self.curr_expert:
            print("Switching to expert", expert)
            self.curr_expert = expert
            # Swap layers 9 and 10 for the selected expert's copies.
            for layer_id in range(2):
                if expert == "OrigChat":
                    self.gpt_neox.layers[layer_id + 9] = self.orig_chat[layer_id]
                elif "USPTO" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.uspto_expert[layer_id]
                elif "Github" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.github_expert[layer_id]
                elif "PubMed" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.pubmed_abstracts_expert[layer_id]
                elif "ArXiv" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.arxiv_expert[layer_id]
                elif "FreeLaw" in expert:
                    self.gpt_neox.layers[layer_id + 9] = self.freelaw_expert[layer_id]
                else:
                    self.gpt_neox.layers[layer_id + 9] = self.merged_chat_expert[layer_id]
        text = [p.strip() for p in text]
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        inputs = inputs.to(device)
        with torch.no_grad():
            outputs = self.generate(
                **inputs,
                max_length=max_length,
                repetition_penalty=1.1,
                min_length=min_length,
                do_sample=True,
                top_p=0.95,
                penalty_alpha=0.6 if do_self_contrastive else None,
                top_k=10,
            )
        ret = []
        for i in range(len(outputs)):  # could use batch_decode, unless we want to do something special here
            out = tokenizer.decode(outputs[i], skip_special_tokens=True)
            if return_answer_only:
                out = out[len(text[i]):].lstrip(".? \n\t")
            ret.append(out)
        return ret


tokenizer = AutoTokenizer.from_pretrained("theblackcat102/pythia-1b-deduped-sft")
tokenizer.pad_token = tokenizer.eos_token
model = GPTNeoXExpertsForCasualLM.from_pretrained(
    "Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts").half().cuda().eval()

print('##')
print(model.generate_with_expert("source code for sorting a list :", tokenizer)[0])
print('##')
print(model.generate_with_expert("When was Abraham Lincoln born? :", tokenizer)[0])
print('##')
print(model.generate_with_expert("Medical journal article about ovarian cancer :", tokenizer)[0])
```
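The fastText router can also be bypassed: passing a non-empty `expert` string selects a layer set directly (any value containing "USPTO", "Github", "PubMed", "ArXiv", or "FreeLaw", or the special names "OrigChat" / "MergedChat"). A minimal sketch; the prompt text is illustrative:

```
# Force the USPTO expert instead of letting the classifier route the prompt.
print(model.generate_with_expert(
    "A method and apparatus for sorting network packets :", tokenizer,
    expert="USPTO-Backgrounds")[0])
```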
## Produces this output:

```
## Switching to expert Github
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
source code for sorting a list : [sort_by(a) > sort_by(([1, 2])) > sort_by([])] sort_by([sort_by([1, 2] + [10 < 5]), 1 - 10 < 5]) # Error: invalid input. [sort_by((1 - 4 - 6)) > sort_by((2 * 9))) > sort_by(-4 - (-6 - -7)) > sort_by(-8 - (-9 - -5)) > sort_by(-(-7 - (-8 - (9)))) > sort_by

## Switching to expert OrigChat
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
When was Abraham Lincoln born? : I'm sorry. Lincoln's parents had died in a plane crash shortly after he was born, and so he spent most of his formative years being raised by his older brother, William Herndon (who also served as Lincoln's secretary during the Civil War). Lincoln's family had been poor all their lives, and were very close to poverty when he was an infant. As a result, there are many stories about him that show how he struggled with hunger at a young age, which led him to want to be able to eat whatever was available. He often ate only enough to

## Switching to expert PubMed-Abstracts
Medical journal article about ovarian cancer : A retrospective study of a population-based cohort in Northern Ireland. In the 1990s and early 2000, there was an increase in the number of new patients with non-ovarian malignancy seen at the National Hospital for Women's Services (Nish) Cancer Screening Service. It is likely that the increase came from a screening programme in the Northern Ireland Health and Care Plan, where people who are not in employment were offered cancer screening by a group of local health care practitioners or nurses. This approach would be appropriate outside the Northern Ireland Cancer Screening Programme and I suspect it was the practice
```

### To recreate the experts, modify this script. It can also be extended to do dynamic merging and/or to experiment with different weights for different layers.

```
def recreate_merged_expert():
    model1 = GPTNeoXExpertsForCasualLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()
    model2 = AutoModelForCausalLM.from_pretrained("stillerman/MDEL-pubmed-feelaw-github-arxiv").float()
    model_uspto = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-uspto").float()
    model_github = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-github").float()
    model_pubmed_abstracts = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-pubmed_abstracts").float()
    model_freelaw = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-freelaw").float()
    model_arxiv = AutoModelForCausalLM.from_pretrained("Multi-Domain-Expert-Layers/expert-arxiv").float()
    model = AutoModelForCausalLM.from_pretrained("theblackcat102/pythia-1b-deduped-sft").float()  # half().cuda().eval()

    with torch.no_grad():
        # Keep an untouched copy of the original chat layers.
        for layer_id in [9, 10]:  # 9,10,11,12,13
            model1.orig_chat[layer_id - 9] = model.gpt_neox.layers[layer_id]

        # Merged chat expert: 0.6 * chat + 0.3 * four-domain merge + 0.1 * USPTO.
        for layer_id in [9, 10]:  # 9,10,11,12,13
            for p1, p2, p3 in zip(model1.gpt_neox.layers[layer_id].parameters(),
                                  model2.gpt_neox.layers[layer_id].parameters(),
                                  model_uspto.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.3 + p3.data * 0.1
            model1.merged_chat_expert[layer_id - 9] = model1.gpt_neox.layers[layer_id]

        # Each domain expert below: 0.6 * domain weights + 0.4 * chat weights.
        for layer_id in [9, 10]:  # 9,10,11,12,13
            for p1, p2 in zip(model_uspto.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.uspto_expert[layer_id - 9] = model_uspto.gpt_neox.layers[layer_id]

        for layer_id in [9, 10]:  # 9,10,11,12,13
            for p1, p2 in zip(model_github.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.github_expert[layer_id - 9] = model_github.gpt_neox.layers[layer_id]

        for layer_id in [9, 10]:  # 9,10,11,12,13
            for p1, p2 in zip(model_pubmed_abstracts.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.pubmed_abstracts_expert[layer_id - 9] = model_pubmed_abstracts.gpt_neox.layers[layer_id]
        for layer_id in [9, 10]:  # 9,10,11,12,13
            for p1, p2 in zip(model_freelaw.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.freelaw_expert[layer_id - 9] = model_freelaw.gpt_neox.layers[layer_id]

        for layer_id in [9, 10]:  # 9,10,11,12,13
            for p1, p2 in zip(model_arxiv.gpt_neox.layers[layer_id].parameters(),
                              model.gpt_neox.layers[layer_id].parameters()):
                p1.data = p1.data * 0.6 + p2.data * 0.4
            model1.arxiv_expert[layer_id - 9] = model_arxiv.gpt_neox.layers[layer_id]

    model1 = model1.half().eval()
    model1.save_pretrained("MDEL-theblackcat-chat-5-experts", torch_dtype=torch.float16)
    model1.push_to_hub("Multi-Domain-Expert-Layers/MDEL-theblackcat-chat-5-experts")
    return model1
```
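The fixed 0.6/0.4 interpolation above could be factored into a helper that takes per-layer weights, which makes it easier to experiment with different mixes or with layers other than 9 and 10. A minimal sketch; `merge_layers` and the `alphas` values are illustrative, not the recipe used for this checkpoint:

```
import torch

def merge_layers(dst_model, src_model, layer_ids=(9, 10), alphas=(0.6, 0.6)):
    """In place: dst_layer = alpha * dst_layer + (1 - alpha) * src_layer."""
    with torch.no_grad():
        for layer_id, alpha in zip(layer_ids, alphas):
            for p_dst, p_src in zip(dst_model.gpt_neox.layers[layer_id].parameters(),
                                    src_model.gpt_neox.layers[layer_id].parameters()):
                p_dst.data = p_dst.data * alpha + p_src.data * (1.0 - alpha)

# e.g. blend the USPTO expert toward the chat model, weighting layer 10 differently:
# merge_layers(model_uspto, model, layer_ids=(9, 10), alphas=(0.6, 0.5))
```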