"""Streamlit demo page that fills a masked token with a RoBERTa Marathi model."""

import json

import streamlit as st
from transformers import AutoTokenizer, RobertaForMaskedLM, pipeline

# Demo configuration; app() reads the checkpoint name from cfg["models"]["RoBERTa"].
with open("config.json", encoding="utf8") as f:
    cfg = json.load(f)


@st.cache(allow_output_mutation=True, show_spinner=False)
def _load_pipeline(model_name_or_path):
    """Build the fill-mask pipeline for ``model_name_or_path``.

    Cached on the model name only, so the tokenizer and model are loaded once
    per checkpoint rather than once per distinct input sentence.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    model = RobertaForMaskedLM.from_pretrained(model_name_or_path)
    return pipeline("fill-mask", model=model, tokenizer=tokenizer)


def load_model(input_text, model_name_or_path):
    """Run fill-mask on ``input_text`` and return the top prediction.

    Args:
        input_text: Sentence to complete; it is passed to the pipeline as-is.
        model_name_or_path: Checkpoint handed to ``from_pretrained``.

    Returns:
        A ``(sentence, mask, result)`` tuple: the ``"sequence"`` and
        ``"token_str"`` fields of the first candidate, plus the raw pipeline
        output.

    NOTE(review): indexing ``result[0]`` as a dict assumes the pipeline returns
    a flat list of candidates; inputs with several mask tokens may yield a
    nested list instead -- confirm against the transformers documentation.
    """
    # Only the pipeline construction is cached. Caching this function as well
    # would put input_text in the cache key and reload the model on every new
    # sentence, which is what the cache is meant to avoid.
    nlp = _load_pipeline(model_name_or_path)
    result = nlp(input_text)
    best = result[0]
    return best["sequence"], best["token_str"], result


def app():
    """Render the page: intro text, sample picker, input box and predictions."""
    st.title("RoBERTa Marathi")

    st.markdown(
        "This demo uses [RoBERTa for Marathi](https://huggingface.co/flax-community/roberta-base-mr) model "
        "trained on [mC4](https://huggingface.co/datasets/mc4)."
    )

    st.markdown(
        "Can't figure out where to get a sample text? Visit this "
        "[link](https://maharashtratimes.com/entertainment/articlelist/19359255.cms), copy any headline and mask a word."
    )

    masked_texts = [
        "मोठी बातमी! उद्या दुपारी वाजता जाहीर होणार दहावीचा निकाल",
        "अध्यक्ष पवार आणि उपमुख्यमंत्री अजित पवार यांची भेट घेतली.",
    ]

    # The sidebar selection pre-fills the text box; the user can still edit it.
    input_text = st.sidebar.selectbox("Select a Text", options=masked_texts)
    masked_text = st.text_input("Please type a masked sentence to fill", input_text)

    fill_button = st.button("Fill the Mask!")
    if not fill_button:
        return

    with st.spinner("Filling the Mask..."):
        try:
            filled_sentence, mask, raw_json = load_model(masked_text, cfg["models"]["RoBERTa"])
        except Exception as err:
            # UI boundary: show a readable message instead of a raw traceback
            # when the model cannot be loaded or the input cannot be filled
            # (for example, when the sentence contains no mask token).
            st.error(f"Could not fill the mask: {err}")
            return

    # The space sits outside the closing ** so the label actually renders bold.
    st.markdown(f"**Filled sentence:** {filled_sentence}")
    st.markdown(f"**Predicted masked token:** {mask}")

    st.write(raw_json)