Commit
·
9d02852
1
Parent(s):
2017989
Update README.md
Browse files
README.md
CHANGED
@@ -20,42 +20,7 @@ tokenizer = AutoTokenizer.from_pretrained("serdarakyol/interpress-turkish-news-c
|
|
20 |
|
21 |
model = AutoModelForSequenceClassification.from_pretrained("serdarakyol/interpress-turkish-news-classification")
|
22 |
```
|
23 |
-
## NOTE: Please remember that to predict with a BERT model you don't actually need any preprocessing, but the dataset here was real-world data. That's why I needed to do some preprocessing. If you have ordinary news text from any news web page, you can just copy and paste it. Then remove the first line of the ***prediction*** function (the `clean_data` call). That's it.
|
24 |
|
25 |
-
```sh
|
26 |
-
# PREPROCESSING
|
27 |
-
import re
|
28 |
-
my_punc = r"#$%&()*+-/:;<=>@[\]^_{|}~"
|
29 |
-
|
30 |
-
def clean_url(content):
|
31 |
-
reg_url=r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?'
|
32 |
-
pattern_url = re.compile(reg_url)
|
33 |
-
content = pattern_url.sub('',content)
|
34 |
-
return content
|
35 |
-
|
36 |
-
def clean_email(content):
|
37 |
-
reg_email='\S*@\S*\s?'
|
38 |
-
pattern_email = re.compile(reg_email)
|
39 |
-
content = pattern_email.sub('',content)
|
40 |
-
return content
|
41 |
-
|
42 |
-
def clean_punctuation(content):
|
43 |
-
content = content.translate(content.maketrans("", "", my_punc))
|
44 |
-
return content
|
45 |
-
|
46 |
-
def clean_data(text):
|
47 |
-
text = clean_url(text)
|
48 |
-
text = clean_email(text)
|
49 |
-
text = clean_punctuation(text)
|
50 |
-
|
51 |
-
filtered_sentence = []
|
52 |
-
for word in text.split(" "):
|
53 |
-
if len(word) > 2:
|
54 |
-
filtered_sentence.append(word)
|
55 |
-
|
56 |
-
text = ' '.join(filtered_sentence)
|
57 |
-
return text
|
58 |
-
```
|
59 |
```sh
|
60 |
import torch
|
61 |
import numpy as np
|
@@ -71,7 +36,6 @@ else:
|
|
71 |
```
|
72 |
```sh
|
73 |
def prediction(news):
|
74 |
-
news=clean_data(news)
|
75 |
news=[news]
|
76 |
indices=tokenizer.batch_encode_plus(
|
77 |
news,
|
|
|
20 |
|
21 |
model = AutoModelForSequenceClassification.from_pretrained("serdarakyol/interpress-turkish-news-classification")
|
22 |
```
|
|
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
```sh
|
25 |
import torch
|
26 |
import numpy as np
|
|
|
36 |
```
|
37 |
```sh
|
38 |
def prediction(news):
|
|
|
39 |
news=[news]
|
40 |
indices=tokenizer.batch_encode_plus(
|
41 |
news,
|