Update README.md
Browse files
README.md
CHANGED
@@ -3,6 +3,69 @@ license: apache-2.0
|
|
3 |
---
|
4 |
### Deprem NER Training Results
|
5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
```
|
7 |
training_args = TrainingArguments(
|
8 |
output_dir="./output",
|
@@ -15,24 +78,19 @@ training_args = TrainingArguments(
|
|
15 |
)
|
16 |
```
|
17 |
|
18 |
-
Threshold: 0.1
|
19 |
-
|
20 |
```
|
21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
Alakasiz 0.92 0.87 0.89 734
|
24 |
-
Barinma 0.87 0.79 0.83 207
|
25 |
-
Elektronik 0.72 0.73 0.73 130
|
26 |
-
Giysi 0.84 0.66 0.74 94
|
27 |
-
Kurtarma 0.84 0.80 0.82 362
|
28 |
-
Lojistik 0.75 0.51 0.61 112
|
29 |
-
Saglik 0.79 0.80 0.79 108
|
30 |
-
Su 0.63 0.47 0.54 78
|
31 |
-
Yagma 0.75 0.58 0.65 31
|
32 |
-
Yemek 0.80 0.77 0.79 117
|
33 |
-
|
34 |
-
micro avg 0.85 0.78 0.81 1973
|
35 |
-
macro avg 0.79 0.70 0.74 1973
|
36 |
-
weighted avg 0.84 0.78 0.81 1973
|
37 |
-
samples avg 0.84 0.82 0.82 1973
|
38 |
```
|
|
|
3 |
---
|
4 |
### Deprem NER Training Results
|
5 |
|
6 |
+
```
|
7 |
+
precision recall f1-score support
|
8 |
+
|
9 |
+
0 0.85 0.91 0.88 734
|
10 |
+
1 0.77 0.84 0.80 207
|
11 |
+
2 0.71 0.88 0.79 130
|
12 |
+
3 0.68 0.76 0.72 94
|
13 |
+
4 0.80 0.85 0.82 362
|
14 |
+
5 0.63 0.59 0.61 112
|
15 |
+
6 0.73 0.82 0.77 108
|
16 |
+
7 0.55 0.77 0.64 78
|
17 |
+
8 0.65 0.71 0.68 31
|
18 |
+
9 0.70 0.85 0.76 117
|
19 |
+
|
20 |
+
micro avg 0.77 0.85 0.81 1973
|
21 |
+
macro avg 0.71 0.80 0.75 1973
|
22 |
+
weighted avg 0.77 0.85 0.81 1973
|
23 |
+
samples avg 0.82 0.87 0.83 1973
|
24 |
+
```
|
25 |
+
|
26 |
+
### Preprocessing Functions
|
27 |
+
```
|
28 |
+
# Start from the Turkish stopword list and add a few extra tokens that are
# filtered out as well.
tr_stopwords = stopwords.words('turkish')
tr_stopwords.extend(["hic", "dm", "vs", "ya"])
|
33 |
+
|
34 |
+
def remove_punct(tok):
    """Return *tok* with every non-word, non-whitespace character removed.

    Letters, digits, underscores and whitespace are kept; everything else
    (including ``#`` and ``@``) is deleted.
    """
    return re.sub(r'[^\w\s]', '', tok)
|
37 |
+
|
38 |
+
def normalize(tok):
    """Replace a token made up only of digits with the placeholder "digit".

    Any other token, including the empty string, is returned unchanged.
    """
    if tok.isdigit():
        return "digit"
    return tok
|
42 |
+
|
43 |
+
def clean(tok):
    """Strip punctuation from *tok*, then map all-digit tokens to "digit".

    Punctuation is removed first, so a token such as "12," is also
    normalized to "digit".
    """
    return normalize(remove_punct(tok))
|
48 |
+
|
49 |
+
def exceptions(tok):
    """Return True if the raw token should be kept, False if it is dropped.

    A token is dropped when it is a single non-digit character, is empty,
    appears in ``tr_stopwords``, or starts with ``#`` or ``@``.
    """
    # Single characters are dropped unless they are a digit.
    if not tok.isdigit() and len(tok) == 1:
        return False

    if not tok:
        return False

    if tok in tr_stopwords:
        return False

    # Tokens starting with '#' or '@' are dropped.
    if tok.startswith(('#', '@')):
        return False

    return True
|
63 |
+
|
64 |
+
|
65 |
+
def sm_tok(text):
    """Split *text* on single spaces and return the cleaned tokens that are kept.

    Filtering with ``exceptions`` runs on the raw token, before ``clean`` is
    applied. A kept token that consists only of punctuation therefore cleans
    to an empty string and stays in the result.
    """
    return [clean(tok) for tok in text.split(" ") if exceptions(tok)]
|
66 |
+
```
|
67 |
+
|
68 |
+
### Other Hyperparameters
|
69 |
```
|
70 |
training_args = TrainingArguments(
|
71 |
output_dir="./output",
|
|
|
78 |
)
|
79 |
```
|
80 |
|
|
|
|
|
81 |
```
|
82 |
+
# Per-class weights, indexed by class id 0-9 (the same ids as the rows of the
# classification report above).
# NOTE(review): presumably applied as loss weights during training - confirm.
class_weights[0] = 1.0
class_weights[1] = 1.5167249178108022
class_weights[2] = 1.7547338578655642
class_weights[3] = 1.9610520059358458
class_weights[4] = 1.269341370129623
class_weights[5] = 1.8684086209021484
class_weights[6] = 1.8019018017117145
class_weights[7] = 2.110648663094536
class_weights[8] = 3.081208739200435
class_weights[9] = 1.7994815143101963
|
92 |
+
```
|
93 |
+
|
94 |
+
Threshold: 0.25
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
```
|