# Hugging Face Spaces page header captured with this file (status: Sleeping); not part of the program.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM | |
import torch | |
import pandas as pd | |
import numpy as np | |
import re | |
import gradio as gr | |
# Hugging Face Hub repository that provides both the tokenizer and the
# seq2seq checkpoint whose input-embedding layer is reused below.
model_repo = "napatswift/mt5-fixpdftext"
tokenizer = AutoTokenizer.from_pretrained(model_repo)
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)
# Keep only the token-embedding layer. get_input_embeddings() is the
# documented accessor; list(model.modules())[1] depended on the order in
# which sub-modules happen to be registered on the model.
embedding = model.get_input_embeddings()
# Drop the name bound to the full model; only `embedding` is used afterwards.
del model
def get_embedding(text):
    """Return one vector for ``text`` by mean-pooling its token embeddings.

    The text is tokenized with the module-level ``tokenizer``; the ids of the
    first returned sequence are passed through the module-level ``embedding``
    layer and the result is averaged over its first axis.

    Args:
        text: The string to embed.

    Returns:
        A ``torch.Tensor`` holding the mean over the token axis. It is
        computed under ``torch.no_grad()`` and therefore carries no autograd
        graph.
    """
    token_ids = tokenizer(text, return_tensors='pt').input_ids[0]
    # Inference only: without no_grad() every cached embedding (and every
    # tensor derived from it) would keep an autograd graph alive.
    with torch.no_grad():
        return embedding(token_ids).mean(dim=0)
# Source table; each row is reduced to a single name further down via get_name().
df = pd.read_csv(filepath_or_buffer='67_all_ministry.csv')
def get_name(row):
    """Pick the first usable name from a row's ``name_*`` columns.

    Entries are scanned in the order ``row.items()`` yields them. An entry
    qualifies when its label is a string starting with ``name_`` and its
    value is a non-empty ``str``.

    Args:
        row: Any object exposing ``items()`` that yields ``(label, value)``
            pairs, e.g. a ``pandas.Series`` row or a ``dict``.

    Returns:
        The first qualifying string, or ``None`` when no entry qualifies.
    """
    for label, value in row.items():
        # Non-string labels (e.g. integer column positions) cannot be name columns.
        if not (isinstance(label, str) and label.startswith('name_')):
            continue
        # Type check first: truth-testing pd.NA or an array-like cell raises,
        # whereas isinstance() is always safe.
        if isinstance(value, str) and value:
            return value
    return None
# One name per row, de-duplicated. get_name() returns None for rows without a
# usable name; drop those so the tokenizer is never handed a non-string.
budget_items = df.apply(get_name, axis=1).dropna().unique().tolist()
# One embedding per entry of budget_items, stacked in the same order so a row
# index into this tensor is also an index into budget_items.
budget_item_embeddings = torch.stack([get_embedding(item) for item in budget_items])
def get_closest_budget_item(text, num_results=5):
    """Return the budget items whose embeddings lie nearest to ``text``.

    The query is embedded with ``get_embedding`` and compared against every
    row of the module-level ``budget_item_embeddings`` using the vector norm
    of the difference (``torch.norm`` over ``dim=1``). Smaller scores mean
    closer matches; results are ordered by ascending score.

    Args:
        text: Query string to look up.
        num_results: Maximum number of rows to return. Coerced with ``int()``
            because UI widgets may deliver the value as a float.

    Returns:
        A ``pandas.DataFrame`` with columns ``budget_item`` (the matched
        names) and ``score`` (their distances as Python floats).
    """
    # Slicing requires an integer; a float from the slider would raise TypeError.
    limit = int(num_results)
    # Pure inference: no gradients are needed for the distance computation.
    with torch.no_grad():
        query_vec = get_embedding(text)
        scores = torch.norm(budget_item_embeddings - query_vec, dim=1)
    # Convert to plain Python ints/floats once, instead of indexing a NumPy
    # array with a torch tensor.
    top_idx = scores.argsort()[:limit].tolist()
    score_values = scores.tolist()
    return pd.DataFrame({
        'budget_item': [budget_items[i] for i in top_idx],
        'score': [score_values[i] for i in top_idx],
    })
# Gradio UI: a free-text query and a result-count slider go in, a table of the
# closest budget items comes out.
demo = gr.Interface(
    fn=get_closest_budget_item,
    inputs=[
        'textbox',
        # step=1 makes every count from 1 to 50 selectable, including the
        # default of 5; with step=5 starting at minimum=1 the slider grid was
        # 1, 6, 11, ... and never landed on the default.
        gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Number of results"),
    ],
    outputs='dataframe',
)

# Start the web app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()