Spaces:
Runtime error
Runtime error
File size: 4,032 Bytes
84ac7fc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 |
import pandas as pd
from datasets import load_metric
import os
import streamlit as st
from transformers import MarianMTModel, MarianTokenizer
# ``st.cache`` is the legacy Streamlit caching API; newer releases provide
# ``st.cache_resource`` for shared, unserialised objects such as models.
# Prefer the new decorator when it exists and fall back to the original
# one otherwise, so the app keeps working on either side of the change.
_cache_models = (
    st.cache_resource
    if hasattr(st, "cache_resource")
    else st.cache(allow_output_mutation=True)
)


@_cache_models
def downloading_model():
    """Load the data, metric, tokenizer and both translation models.

    The result is cached by Streamlit, so the loading work below is not
    repeated on every script rerun.

    Returns:
        A 5-tuple ``(sentence_pair_df, metric, tokenizer, original_model,
        finetuned_model)``:

        * ``sentence_pair_df`` - DataFrame read from ``sentence_pair.json``.
        * ``metric`` - the ``"sacrebleu"`` metric from ``load_metric``.
        * ``tokenizer`` - Marian tokenizer for ``Helsinki-NLP/opus-mt-id-en``;
          it is shared by both models.
        * ``original_model`` - the pretrained ``Helsinki-NLP/opus-mt-id-en``.
        * ``finetuned_model`` - ``wolfrage89/annual_report_translation_id_en``.
    """
    sentence_pair_df = pd.read_json("sentence_pair.json")
    metric = load_metric("sacrebleu")
    tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-id-en")
    original_model = MarianMTModel.from_pretrained(
        "Helsinki-NLP/opus-mt-id-en")
    finetuned_model = MarianMTModel.from_pretrained(
        "wolfrage89/annual_report_translation_id_en")
    return sentence_pair_df, metric, tokenizer, original_model, finetuned_model
def get_translation(model, tokenizer, text, max_length=104):
    """Translate ``text`` with ``model`` and return the decoded sentence.

    Args:
        model: Object exposing ``generate(**encoded_inputs)``; only the
            first generated sequence is used.
        tokenizer: Callable that encodes a list of strings and also
            provides ``decode`` for the generated tokens.
        text: Source sentence to translate.
        max_length: Limit passed to the tokenizer together with
            ``truncation=True``. Defaults to 104, the value that used to
            be hard-coded here.

    Returns:
        The decoded translation with special tokens skipped, or ``""``
        when ``text`` is empty or only whitespace.
    """
    # There is nothing to translate in a blank input, so skip the model.
    if not text or not text.strip():
        return ""

    encoded_inputs = tokenizer(
        [text], return_tensors='pt', max_length=max_length, truncation=True)
    # A single sentence was encoded, so only the first output is relevant.
    translated_tokens = model.generate(**encoded_inputs)[0]
    return tokenizer.decode(translated_tokens, skip_special_tokens=True)
def get_bleu_score(translated_sentence, reference_sentence, metric):
    """Score one translation against one reference with ``metric``.

    Args:
        translated_sentence: The model output to evaluate.
        reference_sentence: The target translation to compare against.
        metric: Object providing ``add(prediction=..., reference=...)``
            and ``compute()``, where ``compute()`` returns a mapping that
            has a ``'score'`` entry.

    Returns:
        The ``'score'`` value reported by ``metric.compute()``, or ``0.0``
        when either sentence is empty or only whitespace.
    """
    # Without both a prediction and a reference there is nothing to
    # compare, so report zero instead of feeding blanks to the metric.
    for sentence in (translated_sentence, reference_sentence):
        if not sentence or not sentence.strip():
            return 0.0

    # The reference is wrapped in a list: one prediction, one reference set.
    metric.add(prediction=translated_sentence, reference=[reference_sentence])
    return metric.compute()['score']
# Initialization: make sure every session-state slot exists before it is
# read further down.
for _state_key in (
    "bahasa_input",
    "ideal_translation",
    "original_translation",
    "finetuned_translation",
):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = ""

sentence_pair_df, metric, tokenizer, original_model, finetuned_model = downloading_model()

# Sidebar: title and the two action buttons.
st.sidebar.title("Bahasa to English Translation (Finance Domain)")
st.sidebar.markdown("---")
random_button = st.sidebar.button(
    "Random")
st.sidebar.write("Randomly generates a bahasa sentence")
st.sidebar.markdown("---")
translate_button = st.sidebar.button(
    "Translate", help="translate bahasa to english")
st.sidebar.write("Translate!")

if random_button:
    # Pick one sentence pair and reset any translations from a prior run.
    sample_data = sentence_pair_df.sample(1)
    st.session_state['bahasa_input'] = sample_data['bahasa'].item()
    st.session_state['ideal_translation'] = sample_data['english'].item()
    st.session_state['original_translation'] = ""
    st.session_state['finetuned_translation'] = ""

if translate_button:
    source_text = st.session_state['bahasa_input']
    if source_text:
        st.session_state['original_translation'] = get_translation(
            original_model, tokenizer, source_text)
        st.session_state['finetuned_translation'] = get_translation(
            finetuned_model, tokenizer, source_text)
        original_bleu_score = get_bleu_score(
            st.session_state['original_translation'], st.session_state['ideal_translation'], metric)
        finetuned_bleu_score = get_bleu_score(
            st.session_state['finetuned_translation'], st.session_state['ideal_translation'], metric)
    else:
        # Nothing to translate: clear the outputs and the reference.
        st.session_state['original_translation'] = ""
        st.session_state['finetuned_translation'] = ""
        st.session_state['ideal_translation'] = ""
        original_bleu_score = 0
        finetuned_bleu_score = 0

with st.container():
    col_1, col_2 = st.columns(2)
    with col_1:
        # Bind the input to session state through ``key`` rather than
        # feeding the stored text back in as ``value``. With a key, the
        # typed text is already in ``st.session_state['bahasa_input']``
        # when the button handlers above run, and the widget keeps a
        # stable identity across reruns so edits are not dropped.
        st.text_area(
            "Bahasa (Input text here)", key="bahasa_input", height=200)
        st.text_area(
            "Pretrained model Translation (Helsinki_id_en)", value=st.session_state['original_translation'], height=200)
        # The scores only exist on the run in which Translate was clicked.
        if translate_button:
            st.write("Bleu score: ", original_bleu_score)
    with col_2:
        st.text_area("Ideal translation (Target)",
                     value=st.session_state['ideal_translation'], height=200)
        st.text_area("Finetuned translation (Finetuned on annual report)",
                     value=st.session_state['finetuned_translation'], height=200)
        if translate_button:
            st.write("Bleu Score: ", finetuned_bleu_score)
|