# Spaces: Running -- hosting-page status banner captured together with the source; not part of the module.
import re | |
from simplemma import lemmatize | |
def flatten(xss):
    """Concatenate the items of every iterable in *xss* into a single list.

    Exactly one level of nesting is removed and the original order of the
    items is kept; deeper nesting is left untouched.
    """
    return [x for xs in xss for x in xs]
def remove_all_brackets(text):
    """Return *text* with every bracketed segment deleted.

    A segment runs from an opening ``(``, ``{`` or ``[`` to the nearest
    following ``)``, ``}`` or ``]``.  The opening and closing characters do
    not have to be of the same kind, and nesting is not balanced: the match
    stops at the first closing character.  Because ``.`` does not match a
    newline here, a segment that spans several lines is left in place.
    Whitespace around a removed segment is not touched.
    """
    return re.sub(r'[\(\{\[].*?[\)\}\]]', '', text)
def lemmatizing(x):
    """Return the lemma of *x* as produced by simplemma with ``lang="cs"``.

    The empty string is returned as is, without calling ``lemmatize``.
    """
    if x == "":
        return ""
    return lemmatize(x, lang="cs")
def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
    """Invert a ``key -> iterable of values`` mapping into ``value -> key``.

    Args:
        dictionary: Mapping whose values are iterables of hashable items.
        apply_lemmatizing: When true, the result of ``lemmatizing(value)`` is
            registered under the same key as well, whenever it differs from
            the value itself.

    Returns:
        A new dict mapping every value (and, optionally, its lemma) to the
        key it was listed under.  If the same value or lemma occurs under
        several keys, the one processed last wins.
    """
    reverse_dictionary = {}
    for key, values in dictionary.items():
        for value in values:
            reverse_dictionary[value] = key
            if apply_lemmatizing:
                lemma = lemmatizing(value)
                # An unchanged lemma was already registered just above.
                if lemma != value:
                    reverse_dictionary[lemma] = key
    return reverse_dictionary
def split_gazetteers_for_single_token_match(gazetteers):
    """Break every gazetteer entry into its space-separated tokens.

    Args:
        gazetteers: Mapping from a label to an iterable of entry strings.

    Returns:
        A new dict with the same labels, each mapped to the set of tokens
        obtained by splitting its entries on single spaces.  Tokens of two
        characters or fewer are dropped, which also discards the empty
        strings produced by consecutive spaces.  The input is not modified.
    """
    return {
        label: {
            token
            for entry in entries
            for token in entry.split(" ")
            if len(token) > 2
        }
        for label, entries in gazetteers.items()
    }
def preprocess_gazetteers(gazetteers, config):
    """Apply the preprocessing steps enabled in *config* to the gazetteers.

    The steps run in the fixed order listed below.  The caller's mapping and
    its entry collections are never modified; a new dict is returned.

    Args:
        gazetteers: Mapping from a label to a collection of entry strings.
            A ``"PER"`` label must be present when ``split_person`` is set.
        config: Mapping that must contain these keys:
            ``split_person`` - add every space-separated part (longer than
            two characters) of each ``"PER"`` entry as an entry of its own;
            ``lemmatize`` - keep only entries longer than two characters and
            add the ``lemmatizing`` result of each of them;
            ``remove_brackets`` - strip bracketed segments and surrounding
            whitespace, dropping results of two characters or fewer;
            ``remove_numeric`` - drop entries for which ``str.isnumeric()``
            is true;
            ``techniq_for_matching`` - any value other than ``"single"``
            makes the entries be split into individual tokens at the end.

    Returns:
        A new dict with the processed entries for every label.

    Raises:
        KeyError: If a required *config* key is missing, or ``split_person``
            is set and *gazetteers* has no ``"PER"`` label.
    """
    # Shallow copy: every step below rebinds labels instead of mutating the
    # caller's collections, so the same source data can be preprocessed
    # again with a different config.
    gazetteers = dict(gazetteers)

    if config["split_person"]:
        person_entries = gazetteers["PER"]
        name_parts = {
            part
            for name in person_entries
            for part in name.split(" ")
            if len(part) > 2
        }
        gazetteers["PER"] = set(person_entries) | name_parts

    if config["lemmatize"]:
        for label, entries in gazetteers.items():
            with_lemmas = set()
            for entry in entries:
                # Short entries are discarded entirely by this step.
                if len(entry) > 2:
                    with_lemmas.add(entry)
                    with_lemmas.add(lemmatizing(entry))
            gazetteers[label] = with_lemmas

    if config["remove_brackets"]:
        for label, entries in gazetteers.items():
            stripped = (remove_all_brackets(entry).strip() for entry in entries)
            gazetteers[label] = {text for text in stripped if len(text) > 2}

    if config["remove_numeric"]:
        for label, entries in gazetteers.items():
            gazetteers[label] = {entry for entry in entries if not entry.isnumeric()}

    # NOTE(review): the splitting helper is named "...for_single_token_match"
    # yet it runs when the technique is NOT "single" -- confirm this
    # condition is intended and not inverted.
    if config["techniq_for_matching"] != "single":
        gazetteers = split_gazetteers_for_single_token_match(gazetteers)

    return gazetteers