Files changed (5) hide show
  1. README.txt +13 -0
  2. __init__.txt +0 -0
  3. app.py +18 -5
  4. gitattributes.txt +34 -0
  5. helper_funcs.py +49 -1
README.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Preprocessing
3
+ emoji: 🔥
4
+ colorFrom: gray
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 3.32.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: veneta/preprocessing
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__init__.txt ADDED
File without changes
app.py CHANGED
@@ -1,26 +1,30 @@
1
  import pandas as pd
2
  import gradio as gr
3
 
4
- from helper_funcs import functions, INPUT_FILE_TYPE, OUTPUT_FILE_TYPE
5
 
6
 
7
  def run_function(selected_function, file_obj, input_column, output_column, output_type):
 
8
  if 'json' in file_obj.name.lower():
9
  df = pd.read_json(file_obj.name)
10
  if any([x in file_obj.name.lower() for x in ['csv', 'txt']]):
11
- df = pd.read_csv(file_obj.name)
12
 
13
  output_file = 'result' + output_type
14
 
15
  if input_column not in list(df.columns):
16
  raise gr.Error("Input column name: such column does not exist in dataframe!")
17
 
18
- return functions[selected_function](df, input_column, output_column, output_file)
 
19
 
20
 
21
  app = gr.Blocks()
22
 
 
23
  with app:
 
24
  gr.Markdown(
25
  """
26
  # Instructions
@@ -78,10 +82,19 @@ with app:
78
  file_types=OUTPUT_FILE_TYPE
79
  )
80
 
81
- gr.Button("Process").click(
 
 
 
 
 
 
 
82
  run_function,
83
  inputs=[selected_function, file_obj, input_column, output_column, output_type],
84
- outputs=[output_dataframe, output_csv]
85
  )
 
 
86
 
87
  app.launch()
 
1
  import pandas as pd
2
  import gradio as gr
3
 
4
+ from helper_funcs import functions, INPUT_FILE_TYPE, OUTPUT_FILE_TYPE, get_classla_stats_df
5
 
6
 
7
  def run_function(selected_function, file_obj, input_column, output_column, output_type):
8
+
9
  if 'json' in file_obj.name.lower():
10
  df = pd.read_json(file_obj.name)
11
  if any([x in file_obj.name.lower() for x in ['csv', 'txt']]):
12
+ df = pd.read_csv(file_obj.name, encoding='utf-8')
13
 
14
  output_file = 'result' + output_type
15
 
16
  if input_column not in list(df.columns):
17
  raise gr.Error("Input column name: such column does not exist in dataframe!")
18
 
19
+ funcs = functions[selected_function](df, input_column, output_column, output_file)
20
+ return funcs
21
 
22
 
23
  app = gr.Blocks()
24
 
25
+
26
  with app:
27
+ process_status = gr.State(False)
28
  gr.Markdown(
29
  """
30
  # Instructions
 
82
  file_types=OUTPUT_FILE_TYPE
83
  )
84
 
85
+ stats_plot = gr.BarPlot(
86
+ value = pd.DataFrame(columns=['value', 'count']),
87
+ x = 'value',
88
+ y = 'count'
89
+ )
90
+
91
+ process_button = gr.Button("Process")
92
+ process_button.click(
93
  run_function,
94
  inputs=[selected_function, file_obj, input_column, output_column, output_type],
95
+ outputs=[output_dataframe, output_csv],
96
  )
97
+ strats_button = gr.Button("Get Stats")
98
+ strats_button.click(get_classla_stats_df, inputs=None, outputs=stats_plot)
99
 
100
  app.launch()
gitattributes.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
helper_funcs.py CHANGED
@@ -1,5 +1,6 @@
1
  import ast
2
  import warnings
 
3
 
4
  import classla
5
  import pandas as pd
@@ -12,6 +13,8 @@ warnings.filterwarnings('ignore')
12
  INPUT_FILE_TYPE = ['.csv', '.json', '.txt']
13
  OUTPUT_FILE_TYPE = ['.csv', '.xlsx']
14
 
 
 
15
 
16
  def to_output(df, output_file):
17
  if 'xlsx' in output_file:
@@ -137,6 +140,45 @@ def get_classla_all(df, input_column, output_column, output_file):
137
  df[output_column] = [clarin_classla_result[index] for index in range(df.shape[0])]
138
  return to_output(df, output_file)
139
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
  def run_all(df, input_column, output_column, output_file):
142
  def load_file(output_file):
@@ -160,6 +202,9 @@ def run_all(df, input_column, output_column, output_file):
160
  _, _ = get_classla_all(df, 'extracted_sentences', 'classla_all', output_file)
161
  df = load_file(output_file)
162
  _, _ = get_classla_ner(df, 'extracted_sentences', 'classla_ner', output_file)
 
 
 
163
  return df.head(10), output_file
164
 
165
 
@@ -171,5 +216,8 @@ functions = {
171
  'separate sentences': get_sentences,
172
  'Classla NER': get_classla_ner,
173
  'Classla full result': get_classla_all,
174
- 'run all': run_all
 
175
  }
 
 
 
1
  import ast
2
  import warnings
3
+ from collections import Counter
4
 
5
  import classla
6
  import pandas as pd
 
13
  INPUT_FILE_TYPE = ['.csv', '.json', '.txt']
14
  OUTPUT_FILE_TYPE = ['.csv', '.xlsx']
15
 
16
+ STATS_OUTPUT = 'classla_stats'
17
+ OUTPUT_FILE_NAME = 'result.csv'
18
 
19
  def to_output(df, output_file):
20
  if 'xlsx' in output_file:
 
140
  df[output_column] = [clarin_classla_result[index] for index in range(df.shape[0])]
141
  return to_output(df, output_file)
142
 
143
def classla_stats(df, input_column, output_column, output_file):
    """Summarise every row of ``input_column`` as a string of item counts.

    Each cell of ``input_column`` is parsed with ``ast.literal_eval`` when it
    is a string and is then iterated as a collection of collections; the
    items of the inner collections are counted per row and stored in
    ``output_column`` as ``str(dict(counts))``.
    NOTE(review): the cells are presumably per-sentence lists of NER tags
    produced by the 'Classla NER' step -- confirm against that step.

    Side effects:
        * ``df`` is modified in place (``input_column`` is replaced by its
          parsed values and ``output_column`` is written).
        * After the output has been written, the module globals
          ``STATS_OUTPUT`` and ``OUTPUT_FILE_NAME`` are updated so that
          ``get_classla_stats_df`` can locate the result.
          NOTE(review): these globals are process-wide, so simultaneous
          runs would overwrite each other's values.

    Args:
        df: Dataframe holding ``input_column``.
        input_column: Column whose cells are counted.
        output_column: Column that receives the per-row counts.
        output_file: File name passed on to ``to_output``.

    Returns:
        The result of ``to_output(df, output_file)``.
    """
    global STATS_OUTPUT, OUTPUT_FILE_NAME

    def parse_cell(cell):
        # Cells read back from a file are strings; cells built in memory may
        # already be parsed, and missing cells arrive as None/NaN.
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        if cell is None or (isinstance(cell, float) and cell != cell):
            return []
        return cell

    def count_ner(ner_list):
        counter = Counter()
        for el in ner_list:
            counter.update(el)
        return str(dict(counter))

    df[input_column] = df[input_column].apply(parse_cell)
    df[output_column] = [count_ner(cell) for cell in df[input_column]]
    result = to_output(df, output_file)

    # Publish the location only once the output was written successfully, so
    # a failed run cannot leave the stats reader pointing at stale data.
    STATS_OUTPUT = output_column
    OUTPUT_FILE_NAME = output_file
    return result
166
+
167
+
168
def get_classla_stats_df():
    """Aggregate the per-row counts of the last stats run into one table.

    Reads the file named by the module global ``OUTPUT_FILE_NAME``, parses
    every string cell of the column named by ``STATS_OUTPUT`` with
    ``ast.literal_eval`` and sums the resulting counts.

    Returns:
        A dataframe with the columns ``'value'`` and ``'count'``, one row per
        distinct key. The frame is empty (but keeps both columns) when the
        output file or the stats column does not exist yet, e.g. before any
        stats run.
    """
    columns = ['value', 'count']

    try:
        # to_output branches on 'xlsx' in the file name; presumably it writes
        # an Excel file in that case, so read it back the same way.
        if 'xlsx' in OUTPUT_FILE_NAME:
            df = pd.read_excel(OUTPUT_FILE_NAME)
        else:
            df = pd.read_csv(OUTPUT_FILE_NAME, encoding='utf-8')
    except FileNotFoundError:
        return pd.DataFrame(columns=columns)

    if STATS_OUTPUT not in df.columns:
        return pd.DataFrame(columns=columns)

    counter = Counter()
    for cell in df[STATS_OUTPUT]:
        # Skip missing cells (NaN) instead of failing on literal_eval.
        if isinstance(cell, str):
            counter.update(ast.literal_eval(cell))

    return pd.DataFrame(list(counter.items()), columns=columns)
181
+
182
 
183
  def run_all(df, input_column, output_column, output_file):
184
  def load_file(output_file):
 
202
  _, _ = get_classla_all(df, 'extracted_sentences', 'classla_all', output_file)
203
  df = load_file(output_file)
204
  _, _ = get_classla_ner(df, 'extracted_sentences', 'classla_ner', output_file)
205
+ df = load_file(output_file)
206
+ _, _ = classla_stats(df, 'classla_ner', 'classla_stats', output_file)
207
+ df = load_file(output_file)
208
  return df.head(10), output_file
209
 
210
 
 
216
  'separate sentences': get_sentences,
217
  'Classla NER': get_classla_ner,
218
  'Classla full result': get_classla_all,
219
+ 'classla stats': classla_stats,
220
+ 'run all': run_all,
221
  }
222
+
223
+