"""Gradio app that recommends authors, abstracts, journals and reviewers.

A submitted abstract is matched against a local FAISS index; each match is
shown in several tables and a Koala-7B text-generation pipeline is asked to
comment on whether the match's author would be a suitable reviewer.
"""
import ast
import textwrap

import gradio as gr
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer, pipeline

# Prefix of every reviewer-fit prompt; the submitted abstract and the
# candidate abstract are appended to it in `inference`.
prompt = (
    'BEGINNING OF CONVERSATION: USER: '
    'I will provide you with two abstracts, I intend to use the author of the second to review the first. '
    'Tell me in a few words why or why not the second author is a good fit to review the first paper.\n'
    'Abstract To Be Reviewed: '
)

tokenizer = LlamaTokenizer.from_pretrained("samwit/koala-7b")

# `device_map='auto'` decides placement of the 8-bit weights; no explicit
# `device` is passed because `from_pretrained` does not take one.
base_model = LlamaForCausalLM.from_pretrained(
    "samwit/koala-7b",
    load_in_8bit=True,
    device_map='auto',
)

# No `device` here either: the model was already placed via `device_map`,
# and forcing the pipeline onto CPU would conflict with that placement.
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    max_length=1024,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
)


def wrap_text_preserve_newlines(text, width=110):
    """Wrap each line of *text* to *width* columns, keeping existing newlines."""
    # Wrap every newline-separated line on its own so that the original
    # line breaks survive the re-flow.
    wrapped_lines = [textwrap.fill(line, width=width) for line in text.split('\n')]
    return '\n'.join(wrapped_lines)


def create_miread_embed(sents, bundle):
    """Embed *sents* on CPU with the (tokenizer, model) pair in *bundle*.

    Args:
        sents: Text (or list of texts) passed straight to the tokenizer.
        bundle: Sequence whose first item is a tokenizer and whose second
            item is a model exposing a ``bert`` sub-module.

    Returns:
        A CPU tensor holding the first-token hidden state of every input.
    """
    tokenizer = bundle[0]
    model = bundle[1]
    model.cpu()
    tokens = tokenizer(
        sents,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    device = torch.device('cpu')
    tokens = tokens.to(device)
    with torch.no_grad():
        out = model.bert(**tokens)
        # Keep only position 0 of the sequence axis for each input.
        feature = out.last_hidden_state[:, 0, :]
    return feature.cpu()


def get_matches(query, k):
    """Return the *k* nearest index entries to *query* as (document, score) pairs."""
    matches = vecdb.similarity_search_with_score(query, k=k)
    return matches


def _first_author(raw_authors):
    """Return the first author from the stored ``authors`` metadata value.

    The value is expected to be the string form of a Python list. It is
    parsed with ``ast.literal_eval`` (never ``eval``) so that index metadata
    cannot execute code. Anything that does not parse to a non-empty
    list/tuple is returned as a plain string.
    """
    try:
        parsed = ast.literal_eval(raw_authors)
    except (ValueError, SyntaxError, TypeError):
        return str(raw_authors)
    if isinstance(parsed, (list, tuple)):
        return parsed[0] if parsed else ''
    return str(parsed)


def inference(query, k=30):
    """Build the abstract, journal, author and reviewer tables for *query*.

    Args:
        query: The submitted abstract (optionally ``title[SEP]abstract``).
        k: Number of nearest matches to retrieve from the index.

    Returns:
        A list of four ``gr.Dataframe`` updates in the order
        ``[abstracts, journals, authors, reviewers]``.
    """
    matches = get_matches(query, int(k))
    j_bucket = {}
    n_table = []
    a_table = []
    r_table = []

    # `float()` accepts both NumPy scalars and plain Python floats.
    scores = [round(float(match[1]), 3) for match in matches]
    # `default` keeps an empty result set from raising ValueError.
    min_score = min(scores, default=0.0)
    max_score = max(scores, default=0.0)

    def normaliser(x):
        # Lowest raw score maps to 1.0; guard the division for max_score == 0.
        if max_score == 0:
            return 1.0
        return round(1 - (x - min_score) / max_score, 3)

    for i, (match, raw_score) in enumerate(zip(matches, scores)):
        doc = match[0]
        score = normaliser(raw_score)
        title = doc.metadata['title']
        author = _first_author(doc.metadata['authors'])
        date = doc.metadata.get('date', 'None')
        link = doc.metadata.get('link', 'None')
        submitter = doc.metadata.get('submitter', 'None')
        journal = doc.metadata.get('journal', 'None')

        # For journals: accumulate the normalised score per journal.
        if journal not in j_bucket:
            j_bucket[journal] = score
        else:
            j_bucket[journal] += score

        # For authors
        n_table.append([i + 1, score, author, title, link, date])

        # For abstracts
        a_table.append([i + 1, title, author, submitter, journal, date, link, score])

        # For reviewer: the original referenced an undefined `candidate`.
        # NOTE(review): presumably the indexed document text is the candidate
        # abstract - confirm against how the FAISS index was built.
        candidate = doc.page_content
        output = pipe(prompt + query + '\n Candidate Abstract: ' + candidate + '\n')
        r_table.append(
            [i + 1, score, author, title, output[0]['generated_text'], link, date]
        )

    # Rank journals by accumulated score, highest first, then number them.
    ranked_journals = sorted(j_bucket.items(), key=lambda item: item[1], reverse=True)
    j_table = [[rank + 1, name, total] for rank, (name, total) in enumerate(ranked_journals)]

    j_output = gr.Dataframe.update(value=j_table, visible=True)
    n_output = gr.Dataframe.update(value=n_table, visible=True)
    a_output = gr.Dataframe.update(value=a_table, visible=True)
    r_output = gr.Dataframe.update(value=r_table, visible=True)
    return [a_output, j_output, n_output, r_output]


model_name = "biodatlab/MIReAD-Neuro"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
faiss_embedder = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Index queried by `get_matches`; loaded from the local "faiss_index" folder.
vecdb = FAISS.load_local("faiss_index", faiss_embedder)

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# NBDT Recommendation Engine for Editors")
    gr.Markdown(
        "NBDT Recommendation Engine for Editors is a tool for neuroscience authors/abstracts/journalsrecommendation built for NBDT journal editors. "
        "It aims to help an editor to find similar reviewers, abstracts, and journals to a given submitted abstract. "
        "To find a recommendation, paste a `title[SEP]abstract` or `abstract` in the text box below and click \"Find Matches\". "
        "Then, you can hover to authors/abstracts/journals tab to find a suggested list. "
        "The data in our current demo is selected from 2018 to 2022. We will update the data monthly for an up-to-date publications."
    )

    abst = gr.Textbox(label="Abstract", lines=10)

    k = gr.Slider(1, 100, step=1, value=50, label="Number of matches to consider")

    action_btn = gr.Button(value="Find Matches")

    # All tables start hidden; `inference` makes them visible with data.
    with gr.Tab("Authors"):
        n_output = gr.Dataframe(
            headers=['No.', 'Score', 'Name', 'Title', 'Link', 'Date'],
            datatype=['number', 'number', 'str', 'str', 'str', 'str'],
            col_count=(6, "fixed"),
            wrap=True,
            visible=False
        )
    with gr.Tab("Abstracts"):
        a_output = gr.Dataframe(
            headers=['No.', 'Title', 'Author', 'Corresponding Author', 'Journal', 'Date', 'Link', 'Score'],
            datatype=['number', 'str', 'str', 'str', 'str', 'str', 'str', 'number'],
            col_count=(8, "fixed"),
            wrap=True,
            visible=False
        )
    with gr.Tab("Journals"):
        j_output = gr.Dataframe(
            headers=['No.', 'Name', 'Score'],
            datatype=['number', 'str', 'number'],
            col_count=(3, "fixed"),
            wrap=True,
            visible=False
        )
    with gr.Tab("Reviewers New"):
        r_output = gr.Dataframe(
            headers=['No.', 'Score', 'Name', 'Title', 'Reasoning', 'Link', 'Date'],
            datatype=['number', 'number', 'str', 'str', 'str', 'str', 'str'],
            col_count=(7, "fixed"),
            wrap=True,
            visible=False
        )

    # Output order must match the list returned by `inference`.
    action_btn.click(fn=inference,
                     inputs=[
                         abst,
                         k,
                         # modes,
                     ],
                     outputs=[a_output, j_output, n_output, r_output],
                     api_name="neurojane")

demo.launch(debug=True)