kargaranamir committed on
Commit
77ab4a2
1 Parent(s): 6706fde

add preprocess

Browse files
Files changed (1) hide show
  1. app.py +31 -2
app.py CHANGED
@@ -6,7 +6,7 @@
6
  # This space is built based on AMR-KELEG/ALDi space.
7
  # GlotLID Space
8
 
9
-
10
  import constants
11
  import pandas as pd
12
  import streamlit as st
@@ -48,6 +48,34 @@ def get_script(text):
48
  return main_script, all_scripts
49
 
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  @st.cache_data
52
  def language_names(json_path):
53
  with open(json_path, 'r') as json_file:
@@ -161,6 +189,8 @@ def compute(sentences, version = 'v2'):
161
  probs = []
162
  labels = []
163
 
 
 
164
  for index, sent in enumerate(sentences):
165
 
166
  output = model_choice.predict(sent)
@@ -227,7 +257,6 @@ with tab1:
227
  clicked = st.button("Submit")
228
 
229
  if sent:
230
- sent = sent.replace('\n', ' ')
231
 
232
  probs, labels = compute([sent], version=version)
233
  prob = probs[0]
 
6
  # This space is built based on AMR-KELEG/ALDi space.
7
  # GlotLID Space
8
 
9
+ import string
10
  import constants
11
  import pandas as pd
12
  import streamlit as st
 
48
  return main_script, all_scripts
49
 
50
 
51
+ def preprocess_text(text):
52
+ """Apply preprocessing to the given text.
53
+ Args:
54
+ text: The text to be preprocessed.
55
+ Returns:
56
+ The preprocessed text.
57
+ """
58
+
59
+ # replace newlines with spaces
60
+ text = text.replace('\n', ' ')
61
+
62
+ # replace ubiquitous characters (ASCII punctuation and digits) with a space
63
+ replace_by = " "
64
+ replacement_map = {
65
+ ord(c): replace_by
66
+ for c in string.punctuation + string.digits
67
+ }
68
+ text = text.translate(replacement_map)
69
+
70
+ # collapse runs of whitespace into a single space
71
+ text = re.sub(r'\s+', ' ', text)
72
+
73
+ # strip the text
74
+ text = text.strip()
75
+
76
+ return text
77
+
78
+
79
  @st.cache_data
80
  def language_names(json_path):
81
  with open(json_path, 'r') as json_file:
 
189
  probs = []
190
  labels = []
191
 
192
+ sentences = [preprocess_text(sent) for sent in sentences]
193
+
194
  for index, sent in enumerate(sentences):
195
 
196
  output = model_choice.predict(sent)
 
257
  clicked = st.button("Submit")
258
 
259
  if sent:
 
260
 
261
  probs, labels = compute([sent], version=version)
262
  prob = probs[0]