Spaces:
Running
Running
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
from copy import deepcopy | |
import re | |
# psyco is an optional JIT accelerator; the module works without it, so a
# missing package is ignored.  Only ImportError is caught so that
# KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    import psyco
    psyco.full()
except ImportError:
    pass
try: | |
from zh_wiki import zh2Hant, zh2Hans | |
except ImportError: | |
from zhtools.zh_wiki import zh2Hant, zh2Hans | |
import sys | |
# True when running under Python 3 or newer.
py3k = sys.version_info[0] >= 3

if not py3k:
    # Python 2: the imported tables hold UTF-8 byte strings, so rebuild both
    # of them with unicode keys and values before they are used.
    _zh2Hant, _zh2Hans = {}, {}
    for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
        for k, v in old.items():
            new[k.decode('utf8')] = v.decode('utf8')
    zh2Hant = _zh2Hant
    zh2Hans = _zh2Hans
    # Empty unicode string used as the neutral text value.
    UEMPTY = ''.decode('utf8')
else:
    # Python 3: str is already unicode, the tables are used as imported.
    UEMPTY = ''
# States a StatesMachine can be in.
START, END, FAIL, WAIT_TAIL = range(4)

# Conditions derived from the node looked up for the current input.
TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR = range(5)

# Registry of ConvertMap objects, keyed by the name passed to registery().
MAPS = {}
class Node(object):
    """Result of looking up one candidate word in a conversion map.

    Attributes:
        from_word: the text that was looked up.
        to_word: the replacement text; equals ``from_word`` when no
            replacement was supplied or the supplied one is empty.
        is_original: True when the node was built without a ``to_word``.
        is_tail: flag passed in by the caller (defaults to True).
        have_child: flag passed in by the caller (defaults to False).
        data: ``(is_tail, have_child, word)`` where ``word`` is the raw
            ``to_word`` argument, or ``from_word`` when it was None.
    """

    def __init__(self, from_word, to_word=None, is_tail=True,
                 have_child=False):
        untouched = to_word is None
        self.from_word = from_word
        if untouched:
            stored_word = from_word
            self.to_word = from_word
        else:
            # data keeps the raw argument, while to_word falls back to
            # from_word when the supplied replacement is empty.
            stored_word = to_word
            self.to_word = to_word or from_word
        self.data = (is_tail, have_child, stored_word)
        self.is_original = untouched
        self.is_tail = is_tail
        self.have_child = have_child

    def is_original_long_word(self):
        """Tell whether this is an unconverted word longer than one char."""
        return self.is_original and len(self.from_word) > 1

    def is_follow(self, chars):
        """Return True when ``chars`` differs from ``from_word`` minus its last char."""
        prefix = self.from_word[:-1]
        return chars != prefix

    def __str__(self):
        fields = (repr(self.from_word), repr(self.to_word),
                  self.is_tail, self.have_child)
        return '<Node, %s, %s, %s, %s>' % fields

    __repr__ = __str__
class ConvertMap(object):
    """Prefix-aware lookup table built from a ``{from_word: to_word}`` mapping.

    Every key of the mapping and every proper prefix of every key gets an
    entry ``(is_tail, have_child, to_word)``:

    * ``is_tail`` is True when the entry is itself a key of the mapping;
    * ``have_child`` is True when the entry is a proper prefix of some key;
    * ``to_word`` is the mapped value, or ``UEMPTY`` for pure prefixes.

    Attributes:
        name: label given by the caller.
        max_key_length: length of the longest key in the mapping (0 when
            no mapping has been loaded).
    """

    def __init__(self, name, mapping=None):
        self.name = name
        self._map = {}
        # Always defined, so an empty map can be queried without raising
        # AttributeError.
        self.max_key_length = 0
        if mapping:
            self.set_convert_map(mapping)

    def set_convert_map(self, mapping):
        """Replace the table contents with entries derived from ``mapping``."""
        have_child = {}
        max_key_length = 0
        # Sorted order guarantees a key is visited before any longer key it
        # prefixes, so the "has a child" mark set by the longer key is not
        # overwritten by the shorter key's own False.
        for key in sorted(mapping.keys()):
            for i in range(1, len(key)):
                have_child[key[:i]] = True
            have_child[key] = False
            max_key_length = max(max_key_length, len(key))

        convert_map = {}
        for key in sorted(have_child.keys()):
            convert_map[key] = (key in mapping, have_child[key],
                                mapping.get(key, UEMPTY))
        self._map = convert_map
        self.max_key_length = max_key_length

    def __getitem__(self, k):
        """Return a Node for ``k``; unknown words yield an original Node."""
        try:
            is_tail, have_child, to_word = self._map[k]
        except KeyError:
            return Node(k)
        return Node(k, to_word, is_tail, have_child)

    def __contains__(self, k):
        return k in self._map

    def __len__(self):
        return len(self._map)
class StatesMachineException(Exception):
    """Error raised by StatesMachine.feed for a machine already in FAIL state."""
class StatesMachine(object):
    """One candidate conversion path through the input text.

    Attributes:
        state: one of START, END, FAIL, WAIT_TAIL.
        final: converted text accumulated by this machine.
        len: number of pieces appended to ``final`` so far.
        pool: characters consumed but not yet converted.
    """

    def __init__(self):
        self.state = START
        self.final = UEMPTY
        self.len = 0
        self.pool = UEMPTY

    def clone(self, pool):
        """Return a deep copy waiting for a longer match that starts with ``pool``."""
        branch = deepcopy(self)
        branch.pool = pool
        branch.state = WAIT_TAIL
        return branch

    def feed(self, char, map):
        """Consume one character.

        Looks up ``pool + char`` in ``map`` and updates this machine.
        Returns a new branch machine when a longer match may still follow,
        otherwise None.  Raises StatesMachineException when a machine in
        FAIL state is fed a word that could still be continued.
        """
        node = map[self.pool + char]

        # Classify the looked-up node.
        if node.have_child and node.is_tail:
            cond = UNMATCHED_SWITCH if node.is_original else MATCHED_SWITCH
        elif node.have_child:
            cond = CONNECTOR
        elif node.is_tail:
            cond = TAIL
        else:
            cond = ERROR

        if cond == ERROR:
            self.state = FAIL
            return None

        if cond == TAIL:
            if self.state == WAIT_TAIL and node.is_original_long_word():
                self.state = FAIL
            else:
                self.final += node.to_word
                self.len += 1
                self.pool = UEMPTY
                self.state = END
            return None

        if self.state in (START, WAIT_TAIL):
            branch = None
            if cond == MATCHED_SWITCH:
                # Clone first so the branch does not contain this emission.
                branch = self.clone(node.from_word)
                self.final += node.to_word
                self.len += 1
                self.state = END
                self.pool = UEMPTY
            elif cond in (UNMATCHED_SWITCH, CONNECTOR):
                if self.state == START:
                    branch = self.clone(node.from_word)
                    self.final += node.to_word
                    self.len += 1
                    self.state = END
                elif node.is_follow(self.pool):
                    self.state = FAIL
                else:
                    self.pool = node.from_word
            return branch

        if self.state == END:
            # END is a new START
            self.state = START
            return self.feed(char, map)

        if self.state == FAIL:
            raise StatesMachineException('Translate States Machine '
                                         'have error with input data %s' % node)
        return None

    def __len__(self):
        return 1 + self.len

    def __str__(self):
        details = (id(self), self.pool, self.state, self.final)
        return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % details

    __repr__ = __str__
class Converter(object):
    """Converts text with the ConvertMap registered under ``to_encoding``.

    Several StatesMachine instances are run side by side.  Once all of them
    have reached END, the result of the machine with the smallest ``len()``
    is appended to the output.
    """

    def __init__(self, to_encoding):
        self.to_encoding = to_encoding
        self.map = MAPS[to_encoding]
        self.start()

    def feed(self, char):
        """Push one character through every machine; return the text so far."""
        spawned = []
        for machine in self.machines:
            branch = machine.feed(char, self.map)
            if branch:
                spawned.append(branch)
        if spawned:
            self.machines += spawned
        self.machines = [machine for machine in self.machines
                         if machine.state != FAIL]
        if all(machine.state == END for machine in self.machines):
            self._clean()
        return self.get_result()

    def _clean(self):
        """Commit the shortest machine's output and restart with a fresh one."""
        if self.machines:
            # The first machine with the minimal length wins, which is the
            # same element a stable sort by length would place first.
            winner = min(self.machines, key=len)
            self.final += winner.final
        self.machines = [StatesMachine()]

    def start(self):
        """Reset the converter to an empty result and a single new machine."""
        self.machines = [StatesMachine()]
        self.final = UEMPTY

    def end(self):
        """Flush pending output, keeping only machines in FAIL or END state."""
        self.machines = [machine for machine in self.machines
                         if machine.state in (FAIL, END)]
        self._clean()

    def convert(self, string):
        """Convert ``string`` from scratch and return the converted text."""
        self.start()
        for char in string:
            self.feed(char)
        self.end()
        return self.get_result()

    def get_result(self):
        """Return the text committed so far."""
        return self.final
def registery(name, mapping):
    """Build a ConvertMap from ``mapping`` and store it in MAPS under ``name``."""
    table = ConvertMap(name, mapping)
    # Item assignment mutates the module-level dict in place.
    MAPS[name] = table
# Register the two imported tables, then drop the module-level references
# to the raw dictionaries.
for _target, _table in (('zh-hant', zh2Hant), ('zh-hans', zh2Hans)):
    registery(_target, _table)
del _target, _table
del zh2Hant, zh2Hans
def run():
    """Command-line entry point: convert a UTF-8 text stream line by line.

    Options:
        -e  name of a registered conversion map (required), e.g. 'zh-hant'
            or 'zh-hans'.
        -f  input file; '-' or omitted means standard input.
        -t  output file; '-' or omitted means standard output.

    Input and output are handled as raw bytes and decoded/encoded as UTF-8
    explicitly, so the same code path works on Python 2 and Python 3.
    """
    import sys
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option('-e', type='string', dest='encoding',
                      help='encoding')
    parser.add_option('-f', type='string', dest='file_in',
                      help='input file (- for stdin)')
    parser.add_option('-t', type='string', dest='file_out',
                      help='output file')
    (options, args) = parser.parse_args()
    if not options.encoding:
        parser.error('encoding must be set')

    use_stdin = not options.file_in or options.file_in == '-'
    use_stdout = not options.file_out or options.file_out == '-'

    # Python 3 exposes the byte stream as ``.buffer``; on Python 2 the
    # standard streams already carry bytes.
    if use_stdin:
        file_in = getattr(sys.stdin, 'buffer', sys.stdin)
    else:
        file_in = open(options.file_in, 'rb')
    try:
        if use_stdout:
            file_out = getattr(sys.stdout, 'buffer', sys.stdout)
        else:
            file_out = open(options.file_out, 'wb')
        try:
            c = Converter(options.encoding)
            for raw_line in file_in:
                line = raw_line.decode('utf8')
                text = line.rstrip('\n')
                # Write the stripped line ending back so that output lines
                # are not glued together.
                ending = line[len(text):]
                file_out.write((c.convert(text) + ending).encode('utf8'))
        finally:
            # Only close what this function opened; just flush stdout.
            if use_stdout:
                file_out.flush()
            else:
                file_out.close()
    finally:
        if not use_stdin:
            file_in.close()
# Run the command-line converter only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    run()