napatswift committed on
Commit
fb358cd
·
1 Parent(s): 42c838f

Add requirements.txt and app.py files

Browse files
Files changed (2) hide show
  1. app.py +47 -0
  2. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
2
+ import torch
3
+ import pandas as pd
4
+ import numpy as np
5
+ import re
6
+ import gradio as gr
7
+
8
# Hugging Face Hub repository that provides both the tokenizer and the model.
model_repo = "napatswift/mt5-fixpdftext"

tokenizer = AutoTokenizer.from_pretrained(model_repo)
model = AutoModelForSeq2SeqLM.from_pretrained(model_repo)

# Only the input token-embedding layer is used by the rest of this script.
# Ask the model for it explicitly rather than taking element 1 of
# ``model.modules()``, which silently depends on the order in which the
# architecture happens to register its sub-modules.
# NOTE(review): for this checkpoint this is presumably the same layer the
# positional lookup returned - confirm once against the loaded model.
embedding = model.get_input_embeddings()

# Drop the module-level reference to the full model; ``embedding`` keeps the
# one layer that is still needed.
del model
15
+
16
def get_embedding(text):
    """Embed ``text`` as the mean of its token embeddings.

    The text is tokenized with the module-level ``tokenizer``, the token ids
    of the first (only) sequence are passed through the module-level
    ``embedding`` layer, and the result is averaged over its first axis.

    Args:
        text: The string to embed.

    Returns:
        A ``torch.Tensor`` holding the mean over the token axis, detached
        from autograd.
    """
    token_ids = tokenizer(text, return_tensors='pt').input_ids[0]
    # Inference only: without ``no_grad`` every returned vector (and the big
    # stacked matrix built from them) would carry an autograd graph.
    with torch.no_grad():
        return embedding(token_ids).mean(dim=0)
18
+
19
# Load the budget table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='67_all_ministry.csv')
20
+
21
def get_name(row):
    """Return the first usable name stored in a ``name_*`` column of ``row``.

    Columns are visited in the order ``row.items()`` yields them. A value is
    usable when its column label starts with ``'name_'``, the value is truthy
    and it is a ``str``.

    Args:
        row: Any mapping-like object exposing ``items()`` that yields
            ``(column_label, value)`` pairs, e.g. a DataFrame row.

    Returns:
        The first usable name, or ``None`` when the row has none.
    """
    usable_names = (
        value
        for column, value in row.items()
        if column.startswith('name_') and value and isinstance(value, str)
    )
    # ``next`` stops at the first match, like the early return of a plain loop.
    return next(usable_names, None)
26
+
27
# Distinct item names, one per row of ``df``. ``get_name`` returns ``None``
# for a row without a usable name; drop those entries so the tokenizer is
# never handed ``None`` when the embeddings are built below.
budget_items = df.apply(get_name, axis=1).dropna().unique().tolist()

# One embedding per entry of ``budget_items``, in the same order, computed
# once at start-up so each query only has to embed the query text.
budget_item_embeddings = torch.stack(
    [get_embedding(item) for item in budget_items]
)
30
+
31
def get_closest_budget_item(text):
    """Return the budget items whose embeddings score highest against ``text``.

    The query is embedded with ``get_embedding`` and scored against every row
    of the module-level ``budget_item_embeddings`` by dot product (the
    vectors are not normalized, so this is not cosine similarity).

    Args:
        text: Free-form query text.

    Returns:
        A ``pandas.DataFrame`` with the columns ``budget_item`` and ``score``
        holding at most five rows, ordered from highest to lowest score.
    """
    # Inference only: do not track gradients for the query or the scores.
    with torch.no_grad():
        text_embedding = get_embedding(text)
        scores = (budget_item_embeddings * text_embedding).sum(dim=1)

    top_idx = scores.argsort(descending=True)[:5]
    return pd.DataFrame({
        # Index the Python list directly: no per-request NumPy copy of every
        # item name and no implicit tensor-to-ndarray index conversion.
        'budget_item': [budget_items[i] for i in top_idx.tolist()],
        'score': scores[top_idx].tolist(),
    })
39
+
40
# Web UI: a five-line text box in, a table of the closest budget items out.
demo = gr.Interface(
    fn=get_closest_budget_item,
    # ``gr.inputs.Textbox`` is the legacy component namespace (deprecated in
    # Gradio 3, removed in Gradio 4). requirements.txt installs an unpinned
    # Gradio, so use the top-level component, which exists in both.
    inputs=gr.Textbox(lines=5, label="Text"),
    outputs='dataframe',
)

# Start the server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
sentencepiece
transformers
# app.py imports torch directly and loads a PyTorch model; transformers does
# not install torch as a dependency, so it has to be listed explicitly.
torch
pandas
numpy
gradio