PuoBERTaSpace

Sleeping

App Files Community

vukosi committed on Jun 1

Commit

3096ba9

1 Parent(s): 8b740b2

Filling Mask

Browse files

Files changed (1) hide show

app.py +52 -11

app.py CHANGED Viewed

@@ -54,9 +54,20 @@ st.sidebar.markdown("""
 # -------------------- CACHING FUNCTIONS --------------------
 @st.cache_resource
 def load_mask_filling_model():
-    tokenizer = AutoTokenizer.from_pretrained("dsfsi/PuoBERTa")
-    model = AutoModelForMaskedLM.from_pretrained("dsfsi/PuoBERTa")
-    return pipeline("fill-mask", model=model, tokenizer=tokenizer, top_k=5)
 @st.cache_resource
 def load_pos_model():
@@ -77,6 +88,23 @@ def load_news_classification_model():
     return pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)
 # -------------------- UTILITY FUNCTIONS --------------------
 def merge_entities(output):
     """Merge consecutive entities of the same type"""
     merged = []
@@ -166,25 +194,31 @@ tab1, tab2, tab3, tab4 = st.tabs(["🎭 Mask Filling", "🏷️ POS Tagging", "
 # -------------------- MASK FILLING TAB --------------------
 with tab1:
     st.header("Mask Filling")
-    st.write("Fill in the blanks in Setswana sentences using `[MASK]` token.")
     mask_examples = [
-        "Ke rata go [MASK] dijo tsa Batswana.",
-        "Botswana ke naga e e [MASK] mo Afrika Borwa.",
-        "Bana ba [MASK] sekolo ka Mosupologo.",
-        "Re tshwanetse go [MASK] tikologo ya rona."
     ]
     mask_input = get_input_text("mask", mask_examples)
     if st.button("Fill Masks", key="mask_button") and mask_input.strip():
-        if "[MASK]" not in mask_input:
-            st.warning("Please include [MASK] token in your text.")
         else:
             with st.spinner("Filling masks..."):
                 try:
                     mask_filler = load_mask_filling_model()
-                    results = mask_filler(mask_input)
                     st.subheader("Predictions")
                     for i, result in enumerate(results, 1):
@@ -193,6 +227,13 @@ with tab1:
                 except Exception as e:
                     st.error(f"Error: {str(e)}")
 # -------------------- POS TAGGING TAB --------------------
 with tab2:

 # -------------------- CACHING FUNCTIONS --------------------
 @st.cache_resource
 def load_mask_filling_model():
+    try:
+        tokenizer = AutoTokenizer.from_pretrained("dsfsi/PuoBERTa")
+        model = AutoModelForMaskedLM.from_pretrained("dsfsi/PuoBERTa")
+        # Create pipeline and verify mask token
+        pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer, top_k=5)
+        # Debug: print mask token for verification
+        print(f"Mask token being used: {tokenizer.mask_token}")
+        return pipe
+    except Exception as e:
+        st.error(f"Failed to load mask filling model: {str(e)}")
+        return None
 @st.cache_resource
 def load_pos_model():
     return pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)
 # -------------------- UTILITY FUNCTIONS --------------------
def get_correct_mask_token(text, tokenizer):
    """Rewrite common mask placeholders in *text* to the tokenizer's own token.

    Recognised placeholders are ``[MASK]``, ``<mask>`` and the HTML-escaped
    ``&lt;mask&gt;``. Each occurrence is replaced with
    ``tokenizer.mask_token``.

    Args:
        text: The user-supplied sentence containing a mask placeholder.
        tokenizer: Any object exposing a ``mask_token`` attribute.

    Returns:
        The text with placeholders normalised. If the tokenizer has no mask
        token (attribute missing, ``None`` or empty), the text is returned
        unchanged rather than failing inside ``str.replace``.
    """
    mask_token = getattr(tokenizer, "mask_token", None)
    if not mask_token:
        # Nothing to normalise to; let the caller/pipeline report the problem.
        return text

    for placeholder in ("[MASK]", "<mask>", "&lt;mask&gt;"):
        # Replacing the token with itself is a no-op, so skip it.
        if placeholder != mask_token:
            text = text.replace(placeholder, mask_token)
    return text
+# Then in your mask filling section, use:
+# corrected_input = get_correct_mask_token(mask_input, mask_filler.tokenizer)
+# results = mask_filler(corrected_input)
 def merge_entities(output):
     """Merge consecutive entities of the same type"""
     merged = []
 # -------------------- MASK FILLING TAB --------------------
 with tab1:
     st.header("Mask Filling")
+    st.write("Fill in the blanks in Setswana sentences using `<mask>` token.")
     mask_examples = [
+        "Ke rata go <mask> dijo tsa Batswana.",
+        "Botswana ke naga e e <mask> mo Afrika Borwa.",
+        "Bana ba <mask> sekolo ka Mosupologo.",
+        "Re tshwanetse go <mask> tikologo ya rona."
     ]
     mask_input = get_input_text("mask", mask_examples)
     if st.button("Fill Masks", key="mask_button") and mask_input.strip():
+        # Check for both mask formats and convert if needed
+        if "[MASK]" in mask_input:
+            mask_input = mask_input.replace("[MASK]", "<mask>")
+            st.info("Converted [MASK] to <mask> format")
+        elif "<mask>" not in mask_input:
+            st.warning("Please include <mask> token in your text.")
         else:
             with st.spinner("Filling masks..."):
                 try:
                     mask_filler = load_mask_filling_model()
+                    corrected_input = get_correct_mask_token(mask_input, mask_filler.tokenizer)
+                    results = mask_filler(corrected_input)
+                    # results = mask_filler(mask_input)
                     st.subheader("Predictions")
                     for i, result in enumerate(results, 1):
                 except Exception as e:
                     st.error(f"Error: {str(e)}")
+                    # Debug information
+                    st.info(f"Input text: {mask_input}")
+                    try:
+                        mask_filler = load_mask_filling_model()
+                        st.info(f"Model mask token: {mask_filler.tokenizer.mask_token}")
+                    except:
+                        pass
 # -------------------- POS TAGGING TAB --------------------
 with tab2: