serdarakyol committed
Commit 9d02852 · 1 Parent(s): 2017989

Update README.md

Files changed (1)
  1. README.md +0 -36
README.md CHANGED
@@ -20,42 +20,7 @@ tokenizer = AutoTokenizer.from_pretrained("serdarakyol/interpress-turkish-news-c
 
  model = AutoModelForSequenceClassification.from_pretrained("serdarakyol/interpress-turkish-news-classification")
  ```
- ## NOTE: You do not actually need to preprocess text before predicting with this BERT model, but the dataset used here was real-world data, which is why some preprocessing was needed. If you have ordinary news text copied from any news website, you can paste it as-is and delete the first line of the ***prediction*** function. That's it.
 
- ```python
- # PREPROCESSING
- import re
- my_punc = r"#$%&()*+-/:;<=>@[\]^_{|}~"
-
- def clean_url(content):
-     # Remove URL-like tokens that contain a known top-level domain
-     reg_url = r'[\S]+\.(net|com|org|info|edu|gov|uk|de|ca|jp|fr|au|us|ru|ch|it|nel|se|no|es|mil)[\S]*\s?'
-     pattern_url = re.compile(reg_url)
-     content = pattern_url.sub('', content)
-     return content
-
- def clean_email(content):
-     # Remove e-mail addresses
-     reg_email = r'\S*@\S*\s?'
-     pattern_email = re.compile(reg_email)
-     content = pattern_email.sub('', content)
-     return content
-
- def clean_punctuation(content):
-     # Drop the punctuation characters listed in my_punc
-     content = content.translate(content.maketrans("", "", my_punc))
-     return content
-
- def clean_data(text):
-     text = clean_url(text)
-     text = clean_email(text)
-     text = clean_punctuation(text)
-
-     # Keep only words longer than two characters
-     filtered_sentence = []
-     for word in text.split(" "):
-         if len(word) > 2:
-             filtered_sentence.append(word)
-
-     text = ' '.join(filtered_sentence)
-     return text
- ```
  ```python
  import torch
  import numpy as np
@@ -71,7 +36,6 @@ else:
  ```
  ```python
  def prediction(news):
-     news = clean_data(news)
      news = [news]
      indices = tokenizer.batch_encode_plus(
          news,
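
The hunk above cuts off inside the ***prediction*** function at the `batch_encode_plus` call. As a rough guide only, here is a minimal sketch of how such a function could be completed after this commit; the device setup, `max_length`, padding options, and the bare class-index return are assumptions for illustration and are not taken from the README.

```python
import numpy as np
import torch

# Assumed setup: pick GPU if available (the README's own device handling is not shown here).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

def prediction(news):
    # After this commit the raw text goes straight to the tokenizer (no clean_data call).
    news = [news]
    indices = tokenizer.batch_encode_plus(
        news,
        max_length=512,              # assumed value; not visible in this diff
        add_special_tokens=True,
        return_attention_mask=True,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    input_ids = indices["input_ids"].to(device)
    attention_mask = indices["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    logits = outputs.logits.detach().cpu().numpy()
    return int(np.argmax(logits, axis=1)[0])   # index of the predicted news category
```

A call like `prediction("...")` then returns an integer class index; mapping it back to a category name depends on the label list used when the model was trained, which this diff does not show.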
 
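For anyone who still wants the old preprocessing path that this commit removes, the helpers were chained through `clean_data`, which the previous ***prediction*** function called on its first line. A tiny usage sketch follows; the sample text is made up for illustration, and it assumes the removed helper definitions above are still in scope.

```python
# Assumes clean_url, clean_email, clean_punctuation and clean_data from the
# removed block above are defined in the current session.
raw_news = "Örnek haber metni: detaylar için www.example.com veya basin@example.com adresine bakın"
cleaned = clean_data(raw_news)  # strips URL/e-mail tokens, the listed punctuation, and words of two characters or fewer
print(cleaned)
```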