dsfsi-language-identification-spaces

Runtime error

kargaranamir HF staff committed on Oct 27, 2023

Commit

77ab4a2

1 Parent(s): 6706fde

add preprocess

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,7 +6,7 @@
 # This space is built based on AMR-KELEG/ALDi space.
 # GlotLID Space
 import constants
 import pandas as pd
 import streamlit as st
@@ -48,6 +48,34 @@ def get_script(text):
     return main_script, all_scripts
 @st.cache_data
 def language_names(json_path):
     with open(json_path, 'r') as json_file:
@@ -161,6 +189,8 @@ def compute(sentences, version = 'v2'):
     probs = []
     labels = []
     for index, sent in enumerate(sentences):
         output = model_choice.predict(sent)
@@ -227,7 +257,6 @@ with tab1:
     clicked = st.button("Submit")
     if sent:
-        sent = sent.replace('\n', ' ')
         probs, labels = compute([sent], version=version)
         prob = probs[0]

 # This space is built based on AMR-KELEG/ALDi space.
 # GlotLID Space
+import string
 import constants
 import pandas as pd
 import streamlit as st
     return main_script, all_scripts
def preprocess_text(text):
    """Normalize raw text before language identification.

    ASCII punctuation and ASCII digits (``string.punctuation`` and
    ``string.digits``) are replaced by spaces, every run of whitespace
    (including newlines and tabs) is collapsed to a single space, and
    leading/trailing whitespace is removed. Non-ASCII punctuation and
    digits are left untouched.

    Args:
        text: The text to be preprocessed.

    Returns:
        The preprocessed text; an empty string if nothing but
        punctuation, digits and whitespace was given.
    """
    # Punctuation and digits are ubiquitous across languages, so blank
    # them out in a single C-level pass.
    replacement_map = str.maketrans(
        dict.fromkeys(string.punctuation + string.digits, " ")
    )
    text = text.translate(replacement_map)
    # split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so joining on a single space both
    # collapses and strips -- without depending on the `re` module.
    return " ".join(text.split())
 @st.cache_data
 def language_names(json_path):
     with open(json_path, 'r') as json_file:
     probs = []
     labels = []
+    sentences = [preprocess_text(sent) for sent in sentences]
     for index, sent in enumerate(sentences):
         output = model_choice.predict(sent)
     clicked = st.button("Submit")
     if sent:
         probs, labels = compute([sent], version=version)
         prob = probs[0]