Spaces:
Runtime error
Runtime error
ffreemt
committed on
Commit
·
767bc4e
1
Parent(s):
fa65d76
Update insert_spaces.py
Browse files- insert_spaces.py +39 -0
insert_spaces.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Insert spaces, mypython/split_chinese.py."""
|
| 2 |
+
import re
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
def insert_spaces(text: str, method: int = None) -> str:
    r"""Insert spaces between Chinese characters, keeping Latin/digit runs intact.

    Methods 0 and 3 insert a space at the same interior positions; they
    differ only at the string edges (method 0 always pads both ends, method 3
    pads an end only when the adjacent character is not an ASCII letter or a
    digit).  Callers that need a stable result should ``.strip()`` it, as the
    examples below do.

    To speed things up, when ``method`` is None the text is inspected first:
    if ASCII letters and spaces make up more than half of the characters,
    method 3 is used, otherwise method 0.

    Args:
        text: string of Latin and Chinese chars.
        method:
            None: default, pick method 3 or 0 as described above.
            0: re.sub(r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " "))  # NOQA
            1: re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)
            2: re.sub(r"[一-龟]|\d+|\w+", r"\g<0> ", text)
            3: re.sub(r"(?<=[^a-zA-Z\d])|(?=[^a-zA-Z\d])", " ", text)
            any other value: treated like 0.

    Returns:
        ``text`` with the spaces inserted; leading/trailing whitespace depends
        on the method used.

    >>> insert_spaces("test亨利it四世上").strip()
    'test 亨 利 it 四 世 上'
    >>> insert_spaces("test亨利it四世上").strip().__len__()
    17

    """
    if method is None:
        # Count Latin *characters* (letters and spaces), not the number of
        # Latin runs: a run count practically never exceeds half the text
        # length, which would leave the method-3 branch unreachable.
        latin_chars = sum(len(run) for run in re.findall(r"[a-zA-Z ]+", text))
        method = 3 if latin_chars > len(text) // 2 else 0  # 3: more latin

    if method == 1:
        return re.sub(r"[一-龟]|[^ 一-龟]+", r"\g<0> ", text)
    if method == 2:
        return re.sub(r"[一-龟]|\d+|\w+", r"\g<0> ", text)
    if method == 3:
        # Zero-width matches: a space goes in at every position that touches
        # a character other than an ASCII letter or a digit.
        return re.sub(r"(?<=[^a-zA-Z\d])|(?=[^a-zA-Z\d])", " ", text)

    # Method 0 and any unrecognised value: put a space around every
    # character, then remove the ones that split a letter/digit run.
    return re.sub(
        r"(?<=[a-zA-Z\d]) (?=[a-zA-Z\d])", "", text.replace("", " ")
    )  # NOQA
|