Spaces:
Runtime error
Runtime error
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Program for tokenizing Indian language input
#
# @author Anoop Kunchukuttan
#
"""
Tokenizer for Indian languages. Currently, simple punctuation-based tokenizers
are supported (see `trivial_tokenize`). Major Indian language punctuations are
handled.
"""
import string, re, sys | |
from indicnlp.common import IndicNlpException | |
### tokenizer patterns
# Each pattern is a single capturing group around a character class, so a
# substitution with r' \1 ' pads every punctuation mark with spaces.
#
# string.punctuation is passed through re.escape() before being spliced into
# the class. Unescaped, its `\]` pair is parsed as an escaped `]`, which
# leaves the backslash itself out of the class, and `,-.` is parsed as a
# character range rather than three literals.

# ASCII punctuation + danda (U+0964) + double danda (U+0965).
triv_tokenizer_indic_pat = re.compile(
    r'([' + re.escape(string.punctuation) + r'\u0964\u0965' + r'])'
)

# ASCII punctuation + punctuation code points from the Arabic Unicode block.
triv_tokenizer_urdu_pat = re.compile(
    r'(['
    + re.escape(string.punctuation)
    + r'\u0609\u060A\u060C\u061E\u066A\u066B\u066C\u066D\u06D4'
    + r'])'
)

## date, numbers, section/article numbering
# Matches runs of ASCII digit groups joined by a space-padded `,`, `.`, `:`
# or `/` (e.g. "12 / 05 / 2020"), i.e. a number or date after its separators
# have been padded with spaces.
pat_num_seq = re.compile(r'([0-9]+ [,.:/] )+[0-9]+')
def trivial_tokenize_indic(text):
    """Tokenize text written in a Brahmi-derived Indian script.

    A trivial, language-independent tokenizer that splits on punctuation
    boundaries. Besides ASCII punctuation, the punctuation marks of the
    Indian scripts (the purna virama and the deergha virama) are handled.
    Digit sequences joined by ``,``, ``.``, ``:`` or ``/`` (numbers, dates,
    section numbering) are kept together as a single token.

    Only the space character acts as a token separator (tabs are converted
    to spaces first); other whitespace such as a newline stays inside the
    token it is attached to. An empty or all-space input yields ``['']``.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens
    """
    # Surround every punctuation mark with spaces.
    padded = triv_tokenizer_indic_pat.sub(r' \1 ', text.replace('\t', ' '))

    # Squeeze runs of spaces and trim both ends.
    normalized = re.sub(r'[ ]+', ' ', padded).strip(' ')

    # Do not tokenize numbers and dates: remove the spaces that the padding
    # step inserted inside each number-like sequence.
    rejoined = pat_num_seq.sub(
        lambda match: match.group(0).replace(' ', ''),
        normalized,
    )

    return rejoined.split(' ')
def trivial_tokenize_urdu(text):
    """Tokenize an Urdu string.

    A trivial tokenizer that splits on punctuation boundaries, covering
    ASCII punctuation as well as punctuation of the Urdu script. The
    script-specific characters were identified from the Unicode database
    for the Arabic script by looking for punctuation symbols.

    Only the space character acts as a token separator (tabs are converted
    to spaces first). An empty or all-space input yields ``['']``.

    Args:
        text (str): text to tokenize

    Returns:
        list: list of tokens
    """
    untabbed = text.replace('\t', ' ')
    # Surround every punctuation mark with spaces.
    spaced = triv_tokenizer_urdu_pat.sub(r' \1 ', untabbed)
    # Squeeze runs of spaces, trim both ends, then split on single spaces.
    squeezed = re.sub(r'[ ]+', ' ', spaced)
    return squeezed.strip(' ').split(' ')
def trivial_tokenize(text, lang='hi'):
    """Trivial tokenizer for Indian languages in Brahmi-derived or Arabic scripts.

    Splits on punctuation boundaries; major punctuation marks specific to
    Indian languages are handled. Dispatches to `trivial_tokenize_urdu`
    when ``lang`` is ``'ur'`` and to `trivial_tokenize_indic` for every
    other value.

    Args:
        text (str): text to tokenize
        lang (str): language code; ``'ur'`` selects the Urdu tokenizer
            (default ``'hi'``)

    Returns:
        list: list of tokens
    """
    tokenizer = trivial_tokenize_urdu if lang == 'ur' else trivial_tokenize_indic
    return tokenizer(text)
# if __name__ == '__main__':
#     if len(sys.argv)<4:
#         print("Usage: python indic_tokenize.py <infile> <outfile> <language>")
#         sys.exit(1)
#     with open(sys.argv[1],'r', encoding='utf-8') as ifile:
#         with open(sys.argv[2],'w', encoding='utf-8') as ofile:
#             for line in ifile:
#                 tokenized_line=' '.join(trivial_tokenize(line,sys.argv[3]))
#                 ofile.write(tokenized_line)