AlzbetaStrompova committed on
Commit
75a65be
1 Parent(s): 19e9ab7

minor changes

app.py CHANGED
@@ -1,32 +1,36 @@
1
- import json
2
  import gradio as gr
3
  from website_script import load, run
4
 
5
  tokenizer, model, gazetteers_for_matching = load()
6
 
7
  examples = [
8
- ["Masarykova univerzita se nachází v Brně .", None],
9
- ["Barack Obama navštívil Prahu minulý týden .", None],
10
- ["Angela Merkelová se setkala s francouzským prezidentem v Paříži .", None],
11
- ["Nobelova cena za fyziku byla udělena týmu vědců z MIT .", None]
12
- ]
 
 
 
 
 
13
 
14
 
15
  def ner(text, file_names):
 
16
  result = run(tokenizer, model, gazetteers_for_matching, text, file_names)
17
  return {"text": text, "entities": result}
18
 
19
  with gr.Blocks(css="./style.css", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
20
  gr.Interface(ner,
21
- gr.Textbox(lines=10, placeholder="Enter sentence here..."),
22
- # gr.HighlightedText(show_legend=True, color_map={"PER": "red", "ORG": "green", "LOC": "blue"}),
23
- gr.HighlightedText(show_legend=True, color_map={"PER": "#f57d7d", "ORG": "#2cf562", "LOC": "#86aafc"}, elem_id="highlighted_text"),
24
  examples=examples,
25
  title="NerROB-czech",
26
- description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
27
  allow_flagging="never",
28
  additional_inputs=gr.File(label="Upload a JSON file containing gazetteers", file_count="multiple", file_types=[".json"]),
29
  )
30
 
31
  if __name__ == "__main__":
32
- demo.launch()
 
 
1
  import gradio as gr
2
  from website_script import load, run
3
 
4
  tokenizer, model, gazetteers_for_matching = load()
5
 
6
  examples = [
7
+ ["Masarykova univerzita se nachází v Brně.", None],
8
+ ["Barack Obama navštívil Prahu minulý týden.", None],
9
+ ["Angela Merkelová se setkala s francouzským prezidentem v Paříži.", None],
10
+ ["Nobelova cena za fyziku byla udělena týmu vědců z MIT.", None],
11
+ ["Eiffelova věž je ikonickou památkou v Paříži.", None],
12
+ ["Bill Gates, spoluzakladatel společnosti Microsoft, oznámil nový grant pro výzkum umělé inteligence.", None],
13
+ ["Britská královna Alžběta II. navštívila Kanadu v rámci svého posledního zahraničního turné, během kterého zdůraznila důležitost spolupráce a přátelství mezi oběma národy.", None],
14
+ ["Francouzský prezident Emmanuel Macron oznámil nový plán na podporu start-upů a inovací ve Francii, který zahrnuje investice ve výši několika miliard eur.", None],
15
+ ["Světová zdravotnická organizace spustila nový program na boj proti malárii v subsaharské Africe, který zahrnuje rozdělování sítí proti komárům a očkování milionů lidí.", None]
16
+ ]
17
 
18
 
19
  def ner(text, file_names):
20
+ text = text.replace(".", " .")
21
  result = run(tokenizer, model, gazetteers_for_matching, text, file_names)
22
  return {"text": text, "entities": result}
23
 
24
  with gr.Blocks(css="./style.css", theme=gr.themes.Default(primary_hue="blue", secondary_hue="sky")) as demo:
25
  gr.Interface(ner,
26
+ gr.Textbox(lines=5, placeholder="Enter sentence here..."),
27
+ gr.HighlightedText(show_legend=True, color_map={"PER": "#f7a7a3", "ORG": "#77fc6a", "LOC": "#87CEFF"}),
 
28
  examples=examples,
29
  title="NerROB-czech",
30
+ description="This is an implementation of a Named Entity Recognition model for the Czech language using gazetteers.",
31
  allow_flagging="never",
32
  additional_inputs=gr.File(label="Upload a JSON file containing gazetteers", file_count="multiple", file_types=[".json"]),
33
  )
34
 
35
  if __name__ == "__main__":
36
+ demo.launch()
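
For context, the dict returned by ner() is the "text" + "entities" format that gr.HighlightedText renders. A minimal sketch of what run() is expected to produce (the "entity"/"start"/"end" keys and the offsets below are an assumption, not taken from this commit):

# Hypothetical example of the value handed to gr.HighlightedText by ner();
# character offsets refer to the input string and are illustrative only.
example_output = {
    "text": "Masarykova univerzita se nachází v Brně .",
    "entities": [
        {"entity": "ORG", "start": 0, "end": 21},   # "Masarykova univerzita"
        {"entity": "LOC", "start": 35, "end": 39},  # "Brně"
    ],
}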
data_manipulation/creation_gazetteers.py ADDED
@@ -0,0 +1,115 @@
1
+ import os
2
+ import re
3
+ import json
4
+ import itertools
5
+
6
+ import pandas as pd
7
+ from simplemma import lemmatize
8
+ from names_dataset import NameDataset
9
+
10
+
11
+ def load_json(path):
12
+ """
13
+ Load gazetteers from a file
14
+ :param path: path to the gazetteer file
15
+ :return: a dict of gazetteers
16
+ """
17
+ with open(path, 'r') as file:
18
+ data = json.load(file)
19
+ return data
20
+
21
+
22
+ def save_json(data, path):
23
+ """
24
+ Save gazetteers to a file
25
+ :param path: path to the gazetteer file
26
+ :param data: a dict of gazetteers to save
27
+ """
28
+ with open(path, 'w') as file:
29
+ json.dump(data, file, indent=4)
30
+
31
+ def merge_gazetteers(*gazetteers):
32
+ """
33
+ Merge multiple gazetteer dictionaries into a single gazetteer dictionary.
34
+
35
+ Returns:
36
+ dict: A merged gazetteer dictionary containing all the keys and values from the input gazetteers.
37
+ """
38
+ # Initialize a new dictionary to store merged results
39
+ merged_gazetteers = {}
40
+ # Iterate over each dictionary provided
41
+ for gaz in gazetteers:
42
+ # Iterate over each key and set in the current dictionary
43
+ for key, value_set in gaz.items():
44
+ if key in merged_gazetteers:
45
+ # If the key already exists in the result, union the sets
46
+ merged_gazetteers[key] |= value_set
47
+ else:
48
+ # Otherwise, initialize the key with the set from the current dictionary
49
+ merged_gazetteers[key] = value_set.copy() # Use copy to avoid mutating the original sets
50
+ return merged_gazetteers
51
+
52
+
53
+ ####################################################################################################
54
+ ### PREPROCESSING OF GAZETTEERS ###################################################################
55
+ ####################################################################################################
56
+
57
+ def remove_all_brackets(text):
58
+ return re.sub(r'[\(\{\[].*?[\)\}\]]', '', text)
59
+
60
+
61
+ def lemmatizing(x):
62
+ if x == "":
63
+ return ""
64
+ return lemmatize(x, lang="cs")
65
+
66
+
67
+ def multi_lemmatizing(x):
68
+ words = x.split(" ")
69
+ phrase = ""
70
+ for word in words:
71
+ phrase += lemmatizing(word) + " "
72
+ return phrase.strip()
73
+
74
+
75
+ def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
76
+ reverse_dictionary = {}
77
+ for key, values in dictionary.items():
78
+ for value in values:
79
+ reverse_dictionary[value] = key
80
+ if apply_lemmatizing:
81
+ temp = lemmatizing(value)
82
+ if temp != value:
83
+ reverse_dictionary[temp] = key
84
+ return reverse_dictionary
85
+
86
+
87
+ def split_gazetteers_for_single_token_match(gazetteers):
88
+ result = {}
89
+ for k, v in gazetteers.items():
90
+ result[k] = set([x for xs in [vv.split(" ") for vv in v] for x in xs])
91
+ result[k] = {x for x in result[k] if len(x) > 2}
92
+ return result
93
+
94
+
95
+ def preprocess_gazetteers(gazetteers, config):
96
+ if config["remove_brackets"]:
97
+ for k, values in gazetteers.items():
98
+ gazetteers[k] = {remove_all_brackets(vv).strip() for vv in values if len(remove_all_brackets(vv).strip()) > 2}
99
+ if config["split_person"]:
100
+ gazetteers["per"].update(set([x for x in list(itertools.chain(*[v.split(" ") for v in gazetteers["per"]])) if len(x) > 2]))
101
+ if config["techniq_for_matching"] == "single":
102
+ gazetteers = split_gazetteers_for_single_token_match(gazetteers)
103
+ if config["lemmatize"]:
104
+ for k, values in gazetteers.items():
105
+ gazetteers[k] = set(list(itertools.chain(*[(vv, lemmatizing(vv)) for vv in values if len(vv) > 2])))
106
+ elif config["lemmatize"]:
107
+ for k, values in gazetteers.items():
108
+ gazetteers[k] = set(list(itertools.chain(*[(value, multi_lemmatizing(value)) for value in values if len(value) > 2])))
109
+
110
+ if config["remove_numeric"]:
111
+ for k, values in gazetteers.items():
112
+ gazetteers[k] = {vv for vv in values if not vv.isnumeric()}
113
+ for k, values in gazetteers.items():
114
+ gazetteers[k] = list(values)
115
+ return gazetteers
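
A minimal usage sketch for the helpers added above (not part of this commit); the file paths are placeholders, the config keys mirror the ones preprocess_gazetteers reads, and the gazetteer JSONs are assumed to map labels such as "per"/"org"/"loc" to lists of names:

# Hypothetical pipeline: load two gazetteer files, merge them, preprocess, save.
from data_manipulation.creation_gazetteers import (
    load_json, save_json, merge_gazetteers, preprocess_gazetteers,
)

config = {
    "remove_brackets": True,
    "split_person": True,
    "techniq_for_matching": "single",  # any other value keeps multi-token phrases
    "lemmatize": True,
    "remove_numeric": True,
}

# merge_gazetteers works on sets while load_json returns lists, so convert first
a = {k: set(v) for k, v in load_json("gazetteers_a.json").items()}
b = {k: set(v) for k, v in load_json("gazetteers_b.json").items()}
merged = merge_gazetteers(a, b)
save_json(preprocess_gazetteers(merged, config), "gazetteers_preprocessed.json")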
data_manipulation/dataset_funcions.py CHANGED
@@ -1,27 +1,10 @@
1
  import os
2
  import re
3
- import json
4
  from tqdm import tqdm
5
 
6
  from datasets import Dataset, DatasetDict
7
 
8
- def load_gazetteers(path):
9
- """
10
- Load gazetteers from a file
11
- :param path: path to the gazetteer file
12
- :return: a dict of gazetteers
13
- """
14
- with open(path, 'r') as f:
15
- gazetteers = json.load(f)
16
- for k, v in gazetteers.items():
17
- gazetteers[k] = set(v)
18
- return gazetteers
19
-
20
- def create_dataset(label_mapper:dict, args):
21
- if args.dataset == "cnec":
22
- return create_cnec_dataset(label_mapper, args)
23
- return load_wikiann_testing_dataset(args)
24
-
25
 
26
  ####################################################################################################
27
  ### GAZETTEERS EMBEDDINGS ##########################################################################
@@ -43,26 +26,36 @@ def find_multi_token_matches(tokens, looking_tokens, gazetteers, matches):
43
  i += 1
44
  return matches
45
 
46
- def find_single_token_matches(tokens, looking_tokens, gazetteers, matches):
47
- return matches
48
 
49
- def find_combination_single_multi_token_matches(tokens, looking_tokens, gazetteers, matches):
50
  return matches
51
 
52
- def gazetteer_matching(words, gazetteers_for_matching):
53
- single_token_match = False
54
- ending_ova = False
55
- apply_lemmatizing = False
56
-
57
 
58
- if single_token_match:
59
- matches = {}
 
 
60
 
 
 
 
 
 
61
  else: # multi_token_match
62
  matches = find_multi_token_matches(words, words, gazetteers_for_matching, {})
63
- # if apply_lemmatizing: TODO
64
- # lemmatize_tokens = [lemmatizing(t) for t in words]
65
- # matches = find_multi_token_matches(words, lemmatize_tokens, gazetteers_for_matching, matches)
66
 
67
  result = []
68
  for word in words:
@@ -70,72 +63,18 @@ def gazetteer_matching(words, gazetteers_for_matching):
70
  per, org, loc = 0, 0, 0
71
  for res in mid_res:
72
  if mid_res[0][0].count(" ") == res[0].count(" "):
73
- if res[1] == "per":
74
- per = 1
75
- elif res[1] == "org":
76
- org = 1
77
- elif res[1] == "loc":
78
- loc = 1
79
  if ending_ova and word.endswith("ová") and word[0].isupper():
80
- per = 1
81
  result.append([per, org, loc])
82
  return result
83
 
84
 
85
- ####################################################################################################
86
- ### GAZETTEERS EXPANSION TRAIN DATASET #############################################################
87
- ####################################################################################################
88
-
89
- def expand_train_dataset_with_gazetteers(train, args):
90
- if args.apply_extended_embeddings:
91
- gazetteers_for_matching = load_gazetteers(args.extended_embeddings_gazetteers_path)
92
- gazetteers = load_gazetteers(args.train_gazetteers_path)
93
- count_gazetteers = {}
94
- id_ = train[-1]["id"]
95
- dataset = []
96
- for row in train:
97
- dataset.append({"id": row['id'], 'tokens': row['tokens'].copy(),
98
- 'ner_tags': row['ner_tags'].copy(), 'gazetteers': row['gazetteers'].copy()})
99
- for k in gazetteers.keys():
100
- count_gazetteers[k] = 0
101
- for index in range(args.gazetteers_counter):
102
- for row in tqdm(train, desc=f"loop {index} from {args.gazetteers_counter}"):
103
- i = 0
104
- temp_1 = row["ner_tags"].copy()
105
- temp_2 = row["tokens"].copy()
106
- if temp_1.count(0) == len(temp_1):
107
- continue
108
- while i < len(temp_1):
109
- tag = temp_1[i]
110
- if tag % 2 == 1:
111
- tags = temp_1[:i]
112
- tokens = temp_2[:i]
113
- i += 1
114
- assert len(gazetteers[tag]) > count_gazetteers[tag]
115
- new = gazetteers[tag][count_gazetteers[tag]].split(" ")
116
- count_gazetteers[tag] += 1
117
- while i < len(temp_1):
118
- if temp_1[i] != tag + 1:
119
- break
120
- i += 1
121
- tags.append(tag)
122
- tags.extend([tag + 1] * (len(new) - 1))
123
- tags.extend(temp_1[i:])
124
-
125
- tokens.extend(new)
126
- tokens.extend(temp_2[i:])
127
- temp_1 = tags
128
- temp_2 = tokens
129
- else:
130
- i += 1
131
- id_ += 1
132
- if args.apply_extended_embeddings:
133
- matching = gazetteer_matching(temp_2, gazetteers_for_matching, args)
134
- dataset.append({"id": id_, 'tokens': temp_2, 'ner_tags': temp_1, "gazetteers": matching})
135
- dataset.append({"id": id_, 'tokens': temp_2, 'ner_tags': temp_1})
136
- return dataset
137
-
138
-
139
  ####################################################################################################
140
  ### CNEC DATASET ###################################################################################
141
  ####################################################################################################
@@ -144,7 +83,6 @@ def get_dataset_from_cnec(label_mapper:dict, xml_file_path, args):
144
  label_mapper: cnec labels to int
145
  """
146
  # Open and read the XML file as plain text
147
- assert os.path.isfile(xml_file_path)
148
  id_ = 0
149
  with open(xml_file_path, "r", encoding="utf-8") as xml_file:
150
  plain_text = xml_file.read()
@@ -156,14 +94,13 @@ def get_dataset_from_cnec(label_mapper:dict, xml_file_path, args):
156
  ne_pattern = r'<ne type="([a-zA-Z?_-]{1,5})">([^<]+)</ne>'
157
  data = []
158
  if args.apply_extended_embeddings:
159
- gazetteers_for_matching = load_gazetteers(args.extended_embeddings_gazetteers_path)
160
- from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
161
  temp = []
162
  for i in gazetteers_for_matching.keys():
163
  temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
164
  gazetteers_for_matching = temp
165
 
166
- for sentence in tqdm(sentences):
167
  entity_mapping = []
168
  while "<ne type=" in sentence: # while because there are nested entities
169
  nes = re.findall(ne_pattern, sentence)
@@ -215,7 +152,7 @@ def get_dataset_from_cnec(label_mapper:dict, xml_file_path, args):
215
  if tags_per_word == [] or tags_per_word == [0]:
216
  continue
217
  if args.apply_extended_embeddings:
218
- matching = gazetteer_matching(words, gazetteers_for_matching)
219
  data.append({"id": id_, 'tokens': words, 'ner_tags': tags_per_word,
220
  "sentence": " ".join(words), "gazetteers": matching})
221
  else:
@@ -223,104 +160,78 @@ def get_dataset_from_cnec(label_mapper:dict, xml_file_path, args):
223
  id_ += 1
224
  return data
225
226
 
227
- def create_dataset2(label_mapper:dict, gazetteers_path):
228
- path = "/nlp/projekty/gazetteer_ner/cnec2.0/data/xml"
229
- dataset = DatasetDict()
230
- for part, file_name in zip(["train", "validation", "test"],["named_ent_train.xml", "named_ent_etest.xml", "named_ent_dtest.xml"]):
231
- file_path = os.path.join(path, file_name)
232
- ##
233
- id_ = 0
234
- with open(file_path, "r", encoding="utf-8") as xml_file:
235
- plain_text = xml_file.read()
236
- plain_text = plain_text[5:-5] # remove unnessery characters
237
- plain_text = re.sub(r'([a-zA-Z.])<ne', r'\1 <ne', plain_text)
238
- plain_text = re.sub(r'</ne>([a-zA-Z.])', r'</ne> \1', plain_text)
239
- plain_text = re.sub(r'[ ]+', ' ', plain_text)
240
- sentences = plain_text.split("\n")
241
- ne_pattern = r'<ne type="([a-zA-Z?_-]{1,5})">([^<]+)</ne>'
242
- data = []
243
- if True:
244
- gazetteers_for_matching = load_gazetteers(gazetteers_path)
245
- from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
246
- temp = []
247
- for i in gazetteers_for_matching.keys():
248
- temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
249
- gazetteers_for_matching = temp
250
-
251
- for sentence in tqdm(sentences):
252
- entity_mapping = []
253
- while "<ne type=" in sentence: # while because there are nested entities
254
- nes = re.findall(ne_pattern, sentence)
255
- for label, entity in nes:
256
- pattern = f'<ne type="{label}">{entity}</ne>'
257
- index = sentence.index(pattern)
258
- temp_index = index
259
- sentence = sentence.replace(pattern, entity, 1)
260
- temp_index -= sum([len(f'<ne type="{tag}">') for tag in re.findall(r'<ne type="([a-zA-Z?_-]{1,5})">', sentence[:index])])
261
- temp_index -= sentence[:index].count("</ne>") * len("</ne>")
262
- temp_index -= (re.sub(r'<ne type="([a-zA-Z?_-]{1,5})">', "", sentence[:index]).replace("</ne>", "")).count(" ")
263
- index = temp_index
264
- entity_mapping.append((entity, label, index, index + len(entity)))
265
-
266
- entities = []
267
- for entity, label, start, end in entity_mapping:
268
- for tag in label_mapper.keys():
269
- if label.lower().startswith(tag):
270
- entities.append((label_mapper[tag], entity, start, end))
271
- break
272
- entities.sort(key=lambda x: len(x[1]), reverse=True)
273
-
274
- words = re.split(r'\s+', sentence)
275
- tags_per_word = []
276
- sentence_counter = -1
277
- for word in words:
278
- sentence_counter += len(word) + 1
279
- if len(entities) == 0:
280
- tags_per_word.append(0) # tag representing no label for no word
281
- for index_entity in range(len(entities)):
282
- if not(sentence_counter - len(word) >= entities[index_entity][2] and
283
- sentence_counter <= entities[index_entity][3] and
284
- word in entities[index_entity][1]):
285
- if index_entity == len(entities) - 1:
286
- tags_per_word.append(0) # tag representing no label for word
287
- continue
288
-
289
- if True:
290
- if sentence_counter - len(word) == entities[index_entity][2]:
291
- tags_per_word.append(entities[index_entity][0] * 2 - 1) # beggining of entity
292
- else:
293
- tags_per_word.append(entities[index_entity][0] * 2) # inside of entity
294
- else:
295
- tags_per_word.append(entities[index_entity][0])
296
  break
 
297
 
298
- if tags_per_word == [] or tags_per_word == [0]:
299
- continue
300
- if True:
301
- matching = gazetteer_matching(words, gazetteers_for_matching)
302
- data.append({"id": id_, 'tokens': words, 'ner_tags': tags_per_word,
303
- "sentence": " ".join(words), "gazetteers": matching})
304
- else:
305
- data.append({"id": id_, 'tokens': words, 'ner_tags': tags_per_word, "sentence": " ".join(words)})
306
- id_ += 1
 
 
 
 
 
307
 
 
 
 
 
308
 
309
- ##
310
- dataset[part] = Dataset.from_list(data)
311
- return dataset
 
 
 
312
 
313
 
314
  def create_cnec_dataset(label_mapper:dict, args):
315
-
316
- assert os.path.isdir(args.cnec_dataset_dir_path)
317
  dataset = DatasetDict()
318
  for part, file_name in zip(["train", "validation", "test"],["named_ent_train.xml", "named_ent_etest.xml", "named_ent_dtest.xml"]):
319
  file_path = os.path.join(args.cnec_dataset_dir_path, file_name)
320
- assert os.path.isfile(file_path)
321
  temp_dataset = get_dataset_from_cnec(label_mapper, file_path, args)
322
- if args.expand_train_data:
323
- temp_dataset = expand_train_dataset_with_gazetteers(temp_dataset, args)
324
  dataset[part] = Dataset.from_list(temp_dataset)
325
  return dataset
326
 
@@ -328,16 +239,19 @@ def create_cnec_dataset(label_mapper:dict, args):
328
  ### WIKIANN DATASET ################################################################################
329
  ####################################################################################################
330
  def load_wikiann_testing_dataset(args):
331
- if args.apply_gazetteers_info:
332
- gazetteers_for_matching = load_gazetteers(args.extended_embeddings_gazetteers_path)
333
- assert os.path.isfile(args.wikiann_dataset_path)
 
 
 
334
  dataset = []
335
  index = 0
336
  sentences = load_tagged_sentences(args.wikiann_dataset_path)
337
  for sentence in sentences:
338
  words = [word for word, _ in sentence]
339
  tags = [tag for _, tag in sentence]
340
- if args.apply_gazetteers_info:
341
  matching = gazetteer_matching(words, gazetteers_for_matching, args)
342
  dataset.append({"id": index, 'tokens': words, 'ner_tags': tags, "gazetteers": matching})
343
  else:
@@ -345,9 +259,10 @@ def load_wikiann_testing_dataset(args):
345
  index += 1
346
 
347
  test = Dataset.from_list(dataset)
348
- # dataset = DatasetDict({"train": Dataset.from_list([{"id": 1, 'tokens': [], 'ner_tags': [], "gazetteers": []}]),
349
- # "validation": Dataset.from_list([{"id": 1, 'tokens': [], 'ner_tags': [], "gazetteers": []}]), "test": test})
350
- dataset = DatasetDict({"test": test})
 
351
  return dataset
352
 
353
 
@@ -400,26 +315,24 @@ def align_labels_with_tokens(labels, word_ids):
400
  new_labels.append(label)
401
  return new_labels
402
 
 
403
  def align_gazetteers_with_tokens(gazetteers, word_ids):
404
- new_g = []
405
  current_word = None
406
  for word_id in word_ids:
407
  if word_id != current_word:
408
  # Start of a new word!
409
  current_word = word_id
410
  gazetteer = [0,0,0] if word_id is None else gazetteers[word_id]
411
- new_g.append(gazetteer)
412
  elif word_id is None:
413
  # Special token
414
- new_g.append([0,0,0])
415
  else:
416
  # Same word as previous token
417
  gazetteer = gazetteers[word_id]
418
- # # If the label is B-XXX we change it to I-XXX
419
- # if gazetteer % 2 == 1:
420
- # gazetteer += 1
421
- new_g.append(gazetteer)
422
- return new_g
423
 
424
 
425
  def create_tokenized_dataset(raw_dataset, tokenizer, apply_extended_embeddings=True):
@@ -434,25 +347,24 @@ def create_tokenized_dataset(raw_dataset, tokenizer, apply_extended_embeddings=T
434
  new_labels.append(align_labels_with_tokens(labels, word_ids))
435
  tokenized_inputs["labels"] = new_labels
436
  if apply_extended_embeddings:
437
- g = examples["gazetteers"]
438
- new_g = []
439
- for i, g in enumerate(g):
440
  word_ids = tokenized_inputs.word_ids(i)
441
- new_g.append(align_gazetteers_with_tokens(g, word_ids))
442
- p, o, l = [], [], []
443
- for i in new_g:
444
- p.append([x[0] for x in i])
445
- o.append([x[1] for x in i])
446
- l.append([x[2] for x in i])
447
- tokenized_inputs["per"] = p
448
- tokenized_inputs["org"] = o
449
- tokenized_inputs["loc"] = l
450
  return tokenized_inputs
451
 
452
-
453
  dataset = raw_dataset.map(
454
  tokenize_and_align_labels,
455
  batched=True,
456
- remove_columns=raw_dataset["train"].column_names,
457
  )
458
  return dataset
 
1
  import os
2
  import re
 
3
  from tqdm import tqdm
4
 
5
  from datasets import Dataset, DatasetDict
6
 
7
+ from data_manipulation.creation_gazetteers import build_reverse_dictionary, lemmatizing, load_json
8
 
9
  ####################################################################################################
10
  ### GAZETTEERS EMBEDDINGS ##########################################################################
 
26
  i += 1
27
  return matches
28
 
 
 
29
 
30
+ def find_single_token_matches(tokens, looking_tokens, gazetteers, matches):
31
+ n = len(tokens)
32
+ assert n == len(looking_tokens)
33
+ for index in range(n):
34
+ word = looking_tokens[index]
35
+ if len(word) < 3:
36
+ continue
37
+ for gazetteer in gazetteers:
38
+ if word in gazetteer:
39
+ match_type = gazetteer[word]
40
+ matches.setdefault(tokens[index], []).append((word, match_type))
41
  return matches
42
 
 
 
 
 
 
43
 
44
+ def gazetteer_matching(words, gazetteers_for_matching, args=None):
45
+ ending_ova = True
46
+ method_for_gazetteers_matching = "single"
47
+ apply_lemmatizing = True
48
 
49
+ if method_for_gazetteers_matching == "single":
50
+ matches = find_single_token_matches(words, words, gazetteers_for_matching, {})
51
+ if apply_lemmatizing:
52
+ lemmatize_tokens = [lemmatizing(t) for t in words]
53
+ matches = find_single_token_matches(words, lemmatize_tokens, gazetteers_for_matching, matches)
54
  else: # multi_token_match
55
  matches = find_multi_token_matches(words, words, gazetteers_for_matching, {})
56
+ if apply_lemmatizing:
57
+ lemmatize_tokens = [lemmatizing(t) for t in words]
58
+ matches = find_multi_token_matches(words, lemmatize_tokens, gazetteers_for_matching, matches)
59
 
60
  result = []
61
  for word in words:
 
63
  per, org, loc = 0, 0, 0
64
  for res in mid_res:
65
  if mid_res[0][0].count(" ") == res[0].count(" "):
66
+ if res[1] == "PER":
67
+ per = 5
68
+ elif res[1] == "ORG":
69
+ org = 5
70
+ elif res[1] == "LOC":
71
+ loc = 5
72
  if ending_ova and word.endswith("ová") and word[0].isupper():
73
+ per = 5
74
  result.append([per, org, loc])
75
  return result
76
 
77
78
  ####################################################################################################
79
  ### CNEC DATASET ###################################################################################
80
  ####################################################################################################
 
83
  label_mapper: cnec labels to int
84
  """
85
  # Open and read the XML file as plain text
 
86
  id_ = 0
87
  with open(xml_file_path, "r", encoding="utf-8") as xml_file:
88
  plain_text = xml_file.read()
 
94
  ne_pattern = r'<ne type="([a-zA-Z?_-]{1,5})">([^<]+)</ne>'
95
  data = []
96
  if args.apply_extended_embeddings:
97
+ gazetteers_for_matching = load_json(args.extended_embeddings_gazetteers_path)
 
98
  temp = []
99
  for i in gazetteers_for_matching.keys():
100
  temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
101
  gazetteers_for_matching = temp
102
 
103
+ for sentence in tqdm(sentences):
104
  entity_mapping = []
105
  while "<ne type=" in sentence: # while because there are nested entities
106
  nes = re.findall(ne_pattern, sentence)
 
152
  if tags_per_word == [] or tags_per_word == [0]:
153
  continue
154
  if args.apply_extended_embeddings:
155
+ matching = gazetteer_matching(words, gazetteers_for_matching, args)
156
  data.append({"id": id_, 'tokens': words, 'ner_tags': tags_per_word,
157
  "sentence": " ".join(words), "gazetteers": matching})
158
  else:
 
160
  id_ += 1
161
  return data
162
 
163
+ def get_default_dataset_from_cnec(label_mapper:dict, xml_file_path):
164
+ """
165
+ label_mapper: cnec labels to int
166
+ """
167
+ # Open and read the XML file as plain text
168
+ id_ = 0
169
+ with open(xml_file_path, "r", encoding="utf-8") as xml_file:
170
+ plain_text = xml_file.read()
171
+ plain_text = plain_text[5:-5] # remove unnecessary characters
172
+ plain_text = re.sub(r'([a-zA-Z.])<ne', r'\1 <ne', plain_text)
173
+ plain_text = re.sub(r'</ne>([a-zA-Z.])', r'</ne> \1', plain_text)
174
+ plain_text = re.sub(r'[ ]+', ' ', plain_text)
175
+ sentences = plain_text.split("\n")
176
+ ne_pattern = r'<ne type="([a-zA-Z?_-]{1,5})">([^<]+)</ne>'
177
+ data = []
178
 
179
+ for sentence in tqdm(sentences):
180
+ entity_mapping = []
181
+ while "<ne type=" in sentence: # while because there are nested entities
182
+ nes = re.findall(ne_pattern, sentence)
183
+ for label, entity in nes:
184
+ pattern = f'<ne type="{label}">{entity}</ne>'
185
+ index = sentence.index(pattern)
186
+ temp_index = index
187
+ sentence = sentence.replace(pattern, entity, 1)
188
+ temp_index -= sum([len(f'<ne type="{tag}">') for tag in re.findall(r'<ne type="([a-zA-Z?_-]{1,5})">', sentence[:index])])
189
+ temp_index -= sentence[:index].count("</ne>") * len("</ne>")
190
+ temp_index -= (re.sub(r'<ne type="([a-zA-Z?_-]{1,5})">', "", sentence[:index]).replace("</ne>", "")).count(" ")
191
+ index = temp_index
192
+ entity_mapping.append((entity, label, index, index + len(entity)))
193
+
194
+ entities = []
195
+ for entity, label, start, end in entity_mapping:
196
+ for tag in label_mapper.keys():
197
+ if label.lower().startswith(tag):
198
+ entities.append((label_mapper[tag], entity, start, end))
199
  break
200
+ entities.sort(key=lambda x: len(x[1]), reverse=True)
201
 
202
+ words = re.split(r'\s+', sentence)
203
+ tags_per_word = []
204
+ sentence_counter = -1
205
+ for word in words:
206
+ sentence_counter += len(word) + 1
207
+ if len(entities) == 0:
208
+ tags_per_word.append(0) # tag representing no label for no word
209
+ for index_entity in range(len(entities)):
210
+ if not(sentence_counter - len(word) >= entities[index_entity][2] and
211
+ sentence_counter <= entities[index_entity][3] and
212
+ word in entities[index_entity][1]):
213
+ if index_entity == len(entities) - 1:
214
+ tags_per_word.append(0) # tag representing no label for word
215
+ continue
216
 
217
+ if sentence_counter - len(word) == entities[index_entity][2]:
218
+ tags_per_word.append(entities[index_entity][0] * 2 - 1) # beginning of entity
219
+ else:
220
+ tags_per_word.append(entities[index_entity][0] * 2) # inside of entity
221
 
222
+ if tags_per_word == [] or tags_per_word == [0]:
223
+ continue
224
+
225
+ data.append({"id": id_, 'tokens': words, 'ner_tags': tags_per_word, "sentence": " ".join(words)})
226
+ id_ += 1
227
+ return data
228
 
229
 
230
  def create_cnec_dataset(label_mapper:dict, args):
 
 
231
  dataset = DatasetDict()
232
  for part, file_name in zip(["train", "validation", "test"],["named_ent_train.xml", "named_ent_etest.xml", "named_ent_dtest.xml"]):
233
  file_path = os.path.join(args.cnec_dataset_dir_path, file_name)
 
234
  temp_dataset = get_dataset_from_cnec(label_mapper, file_path, args)
 
 
235
  dataset[part] = Dataset.from_list(temp_dataset)
236
  return dataset
237
 
 
239
  ### WIKIANN DATASET ################################################################################
240
  ####################################################################################################
241
  def load_wikiann_testing_dataset(args):
242
+ if args.apply_extended_embeddings:
243
+ gazetteers_for_matching = load_json(args.extended_embeddings_gazetteers_path)
244
+ temp = []
245
+ for i in gazetteers_for_matching.keys():
246
+ temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
247
+ gazetteers_for_matching = temp
248
  dataset = []
249
  index = 0
250
  sentences = load_tagged_sentences(args.wikiann_dataset_path)
251
  for sentence in sentences:
252
  words = [word for word, _ in sentence]
253
  tags = [tag for _, tag in sentence]
254
+ if args.apply_extended_embeddings:
255
  matching = gazetteer_matching(words, gazetteers_for_matching, args)
256
  dataset.append({"id": index, 'tokens': words, 'ner_tags': tags, "gazetteers": matching})
257
  else:
 
259
  index += 1
260
 
261
  test = Dataset.from_list(dataset)
262
+ dataset = DatasetDict({"train": Dataset.from_list([{"id": 1, 'tokens': [], 'ner_tags': [], "gazetteers": []}]),
263
+ "validation": Dataset.from_list([{"id": 1, 'tokens': [], 'ner_tags': [], "gazetteers": []}]),
264
+ "test": test})
265
+ # dataset = DatasetDict({"test": test})
266
  return dataset
267
 
268
 
 
315
  new_labels.append(label)
316
  return new_labels
317
 
318
+
319
  def align_gazetteers_with_tokens(gazetteers, word_ids):
320
+ aligned_gazetteers = []
321
  current_word = None
322
  for word_id in word_ids:
323
  if word_id != current_word:
324
  # Start of a new word!
325
  current_word = word_id
326
  gazetteer = [0,0,0] if word_id is None else gazetteers[word_id]
327
+ aligned_gazetteers.append(gazetteer)
328
  elif word_id is None:
329
  # Special token
330
+ aligned_gazetteers.append([0,0,0])
331
  else:
332
  # Same word as previous token
333
  gazetteer = gazetteers[word_id]
334
+ aligned_gazetteers.append(gazetteer)
335
+ return aligned_gazetteers
 
 
 
336
 
337
 
338
  def create_tokenized_dataset(raw_dataset, tokenizer, apply_extended_embeddings=True):
 
347
  new_labels.append(align_labels_with_tokens(labels, word_ids))
348
  tokenized_inputs["labels"] = new_labels
349
  if apply_extended_embeddings:
350
+ matches = examples["gazetteers"]
351
+ aligned_matches = []
352
+ for i, match in enumerate(matches):
353
  word_ids = tokenized_inputs.word_ids(i)
354
+ aligned_matches.append(align_gazetteers_with_tokens(match, word_ids))
355
+ per, org, loc = [], [], []
356
+ for i in aligned_matches:
357
+ per.append([x[0] for x in i])
358
+ org.append([x[1] for x in i])
359
+ loc.append([x[2] for x in i])
360
+ tokenized_inputs["per"] = per
361
+ tokenized_inputs["org"] = org
362
+ tokenized_inputs["loc"] = loc
363
  return tokenized_inputs
364
 
 
365
  dataset = raw_dataset.map(
366
  tokenize_and_align_labels,
367
  batched=True,
368
+ # remove_columns=raw_dataset["train"].column_names
369
  )
370
  return dataset
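
A hedged sketch of how the reworked matching above is wired up (mirroring the pattern used in get_dataset_from_cnec and website_script.py); the gazetteer path is a placeholder and the file is assumed to map "PER"/"ORG"/"LOC" to name lists:

# Hypothetical example: build one reverse dictionary per label, then score words.
from data_manipulation.creation_gazetteers import build_reverse_dictionary, load_json
from data_manipulation.dataset_funcions import gazetteer_matching

gazetteers = load_json("gazetteers_preprocessed.json")
gazetteers_for_matching = [
    build_reverse_dictionary({label: values}) for label, values in gazetteers.items()
]

words = "Masarykova univerzita se nachází v Brně .".split()
matching = gazetteer_matching(words, gazetteers_for_matching)
# one [per, org, loc] triple per word, with 5 marking a gazetteer hit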
data_manipulation/preprocess_gazetteers.py DELETED
@@ -1,54 +0,0 @@
1
- import re
2
-
3
- from simplemma import lemmatize
4
-
5
-
6
- def flatten(xss):
7
- return [x for xs in xss for x in xs]
8
-
9
-
10
- def remove_all_brackets(text):
11
- return re.sub(r'[\(\{\[].*?[\)\}\]]', '', text)
12
-
13
-
14
- def lemmatizing(x):
15
- if x == "":
16
- return ""
17
- return lemmatize(x, lang="cs")
18
-
19
-
20
- def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
21
- reverse_dictionary = {}
22
- for key, values in dictionary.items():
23
- for value in values:
24
- reverse_dictionary[value] = key
25
- if apply_lemmatizing:
26
- temp = lemmatizing(value)
27
- if temp != value:
28
- reverse_dictionary[temp] = key
29
- return reverse_dictionary
30
-
31
-
32
- def split_gazetteers_for_single_token_match(gazetteers):
33
- result = {}
34
- for k, v in gazetteers.items():
35
- result[k] = set(flatten([vv.split(" ") for vv in v]))
36
- result[k] = {x for x in result[k] if len(x) > 2}
37
- return result
38
-
39
-
40
- def preprocess_gazetteers(gazetteers, config):
41
- if config["split_person"]:
42
- gazetteers["PER"].update(set([x for x in flatten([v.split(" ") for v in gazetteers["PER"]]) if len(x) > 2]))
43
- if config["lemmatize"]:
44
- for k, v in gazetteers.items():
45
- gazetteers[k] = set(flatten([(vv, lemmatizing(vv)) for vv in v if len(vv) > 2]))
46
- if config["remove_brackets"]:
47
- for k, v in gazetteers.items():
48
- gazetteers[k] = {remove_all_brackets(vv).strip() for vv in v if len(remove_all_brackets(vv).strip()) > 2}
49
- if config["remove_numeric"]:
50
- for k, v in gazetteers.items():
51
- gazetteers[k] = {vv for vv in v if not vv.isnumeric()}
52
- if config["techniq_for_matching"] != "single":
53
- gazetteers = split_gazetteers_for_single_token_match(gazetteers)
54
- return gazetteers
extended_embeddings/__init__.py DELETED
File without changes
extended_embeddings/{token_classification.py → extended_embedding_token_classification.py} RENAMED
@@ -1,4 +1,4 @@
1
- from typing import List, Optional, Tuple, Union
2
 
3
  import torch
4
  from torch import nn
@@ -12,11 +12,20 @@ _CONFIG_FOR_DOC = "RobertaConfig"
12
 
13
 
14
  class ExtendedEmbeddigsRobertaForTokenClassification(RobertaForTokenClassification):
15
  def __init__(self, config):
16
  super().__init__(config)
17
  self.num_labels = config.num_labels
18
 
19
- self.roberta = ExtendedEmbeddigsRobertaModel(config, add_pooling_layer=False)
20
  classifier_dropout = (
21
  config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
22
  )
@@ -92,4 +101,5 @@ class ExtendedEmbeddigsRobertaForTokenClassification(RobertaForTokenClassificati
92
  logits=logits,
93
  hidden_states=outputs.hidden_states,
94
  attentions=outputs.attentions,
95
- )
 
 
1
+ from typing import Optional, Tuple, Union
2
 
3
  import torch
4
  from torch import nn
 
12
 
13
 
14
  class ExtendedEmbeddigsRobertaForTokenClassification(RobertaForTokenClassification):
15
+ """
16
+ A RobertaForTokenClassification for token classification tasks with extended embeddings.
17
+
18
+ This RobertaForTokenClassification extends the functionality of the `RobertaForTokenClassification` class
19
+ by adding support for additional features such as `per`, `org`, and `loc`.
20
+
21
+ Part of the code copied from: transformers.models.roberta.modeling_roberta.RobertaForTokenClassification
22
+
23
+ """
24
  def __init__(self, config):
25
  super().__init__(config)
26
  self.num_labels = config.num_labels
27
 
28
+ self.roberta = ExtendedEmbeddigsRobertaModel(config)
29
  classifier_dropout = (
30
  config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
31
  )
 
101
  logits=logits,
102
  hidden_states=outputs.hidden_states,
103
  attentions=outputs.attentions,
104
+ )
105
+
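
An illustrative sketch of calling the extended model (not part of this commit); the checkpoint path and tensor values are placeholders, and it assumes the classification head forwards the per/org/loc tensors to ExtendedEmbeddigsRobertaModel as its forward signature suggests:

# Hypothetical forward pass with gazetteer flags aligned to sub-word tokens.
import torch
from extended_embeddings.extended_embedding_token_classification import (
    ExtendedEmbeddigsRobertaForTokenClassification,
)

model = ExtendedEmbeddigsRobertaForTokenClassification.from_pretrained("path/to/czech-ner-checkpoint")
model.eval()
batch = {
    "input_ids": torch.tensor([[0, 100, 200, 2]]),
    "attention_mask": torch.tensor([[1, 1, 1, 1]]),
    "per": torch.tensor([[0, 5, 0, 0]]),
    "org": torch.tensor([[0, 0, 0, 0]]),
    "loc": torch.tensor([[0, 0, 0, 0]]),
}
with torch.no_grad():
    logits = model(**batch).logits  # (batch, seq_len, num_labels)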
extended_embeddings/extended_embeddings_data_collator.py ADDED
@@ -0,0 +1,77 @@
1
+ import torch
2
+ from transformers import DataCollatorForTokenClassification
3
+ from transformers.data.data_collator import pad_without_fast_tokenizer_warning
4
+
5
+
6
+ class ExtendedEmbeddingsDataCollatorForTokenClassification(DataCollatorForTokenClassification):
7
+ """
8
+ A data collator for token classification tasks with extended embeddings.
9
+
10
+ This data collator extends the functionality of the `DataCollatorForTokenClassification` class
11
+ by adding support for additional features such as `per`, `org`, and `loc`.
12
+
13
+ Part of the code copied from: transformers.data.data_collator.DataCollatorForTokenClassification
14
+ """
15
+
16
+ def torch_call(self, features):
17
+ label_name = "label" if "label" in features[0].keys() else "labels"
18
+ labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
19
+ per = [feature["per"] for feature in features] if "per" in features[0].keys() else None
20
+ org = [feature["org"] for feature in features] if "org" in features[0].keys() else None
21
+ loc = [feature["loc"] for feature in features] if "loc" in features[0].keys() else None
22
+
23
+ no_labels_features = [{k: v for k, v in feature.items() if k not in [label_name, "per", "org", "loc"]} for feature in features]
24
+
25
+ batch = pad_without_fast_tokenizer_warning(
26
+ self.tokenizer,
27
+ no_labels_features,
28
+ padding=self.padding,
29
+ max_length=self.max_length,
30
+ pad_to_multiple_of=self.pad_to_multiple_of,
31
+ return_tensors="pt",
32
+ )
33
+
34
+ if labels is None:
35
+ return batch
36
+
37
+ sequence_length = batch["input_ids"].shape[1]
38
+ padding_side = self.tokenizer.padding_side
39
+
40
+ def to_list(tensor_or_iterable):
41
+ if isinstance(tensor_or_iterable, torch.Tensor):
42
+ return tensor_or_iterable.tolist()
43
+ return list(tensor_or_iterable)
44
+
45
+ if padding_side == "right":
46
+ batch[label_name] = [
47
+ to_list(label) + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels
48
+ ]
49
+ batch["per"] = [
50
+ to_list(p) + [0] * (sequence_length - len(p)) for p in per
51
+ ]
52
+ batch["org"] = [
53
+ to_list(o) + [0] * (sequence_length - len(o)) for o in org
54
+ ]
55
+ batch["loc"] = [
56
+ to_list(l) + [0] * (sequence_length - len(l)) for l in loc
57
+ ]
58
+ else:
59
+ batch[label_name] = [
60
+ [self.label_pad_token_id] * (sequence_length - len(label)) + to_list(label) for label in labels
61
+ ]
62
+ batch["per"] = [
63
+ [0] * (sequence_length - len(p)) + to_list(p) for p in per
64
+ ]
65
+ batch["org"] = [
66
+ [0] * (sequence_length - len(o)) + to_list(o) for o in org
67
+ ]
68
+ batch["loc"] = [
69
+ [0] * (sequence_length - len(l)) + to_list(l) for l in loc
70
+ ]
71
+
72
+ batch[label_name] = torch.tensor(batch[label_name], dtype=torch.int64)
73
+ batch["per"] = torch.tensor(batch["per"], dtype=torch.int64)
74
+ batch["org"] = torch.tensor(batch["org"], dtype=torch.int64)
75
+ batch["loc"] = torch.tensor(batch["loc"], dtype=torch.int64)
76
+ return batch
77
+
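
A small sketch of what the new collator does with hand-made features (the checkpoint name and token ids are placeholders); it pads input_ids via the tokenizer, labels with label_pad_token_id, and per/org/loc with zeros, as implemented above:

# Hypothetical usage of the collator on two already-tokenized examples.
from transformers import AutoTokenizer
from extended_embeddings.extended_embeddings_data_collator import (
    ExtendedEmbeddingsDataCollatorForTokenClassification,
)

tokenizer = AutoTokenizer.from_pretrained("ufal/robeczech-base")  # placeholder checkpoint
collator = ExtendedEmbeddingsDataCollatorForTokenClassification(tokenizer=tokenizer)

features = [
    {"input_ids": [0, 100, 200, 2], "labels": [-100, 1, 2, -100],
     "per": [0, 5, 5, 0], "org": [0, 0, 0, 0], "loc": [0, 0, 0, 0]},
    {"input_ids": [0, 300, 2], "labels": [-100, 0, -100],
     "per": [0, 0, 0], "org": [0, 0, 0], "loc": [0, 0, 0]},
]
batch = collator(features)  # all keys padded to the same sequence length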
extended_embeddings/extended_embeddings_model.py CHANGED
@@ -1,53 +1,27 @@
1
- from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaEncoder, RobertaEmbeddings
2
- from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
3
  from typing import List, Optional, Tuple, Union
 
4
  import torch
5
- from torch.nn import functional as F
6
- from torch import nn
7
 
8
- # Copied from transformers.models.bert.modeling_bert.BertPooler
9
- class ExtendedEmbeddigsRobertaPooler(nn.Module):
10
- def __init__(self, config):
11
- super().__init__()
12
- size_of_gazetters_part = int((len(config.id2label.keys()) - 1) // 2)
13
- self.dense = nn.Linear(config.hidden_size + size_of_gazetters_part, config.hidden_size + size_of_gazetters_part)
14
- self.activation = nn.Tanh()
15
-
16
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
17
- # We "pool" the model by simply taking the hidden state corresponding
18
- # to the first token.
19
- first_token_tensor = hidden_states[:, 0]
20
- pooled_output = self.dense(first_token_tensor)
21
- pooled_output = self.activation(pooled_output)
22
- return pooled_output
23
 
24
  class ExtendedEmbeddigsRobertaModel(RobertaModel):
25
  """
 
26
 
27
- The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
28
- cross-attention is added between the self-attention layers, following the architecture described in *Attention is
29
- all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
30
- Kaiser and Illia Polosukhin.
31
 
32
- To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
33
- to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
34
- `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
35
-
36
- .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
37
 
38
  """
39
-
40
- # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
41
- def __init__(self, config, add_pooling_layer=True):
42
  super().__init__(config)
43
  self.config = config
44
 
45
  self.embeddings = RobertaEmbeddings(config)
46
  self.encoder = RobertaEncoder(config)
47
- # self.gazetteers = GazetteersNetwork() # change
48
-
49
- self.pooler = ExtendedEmbeddigsRobertaPooler(config)
50
-
51
  # Initialize weights and apply final processing
52
  self.post_init()
53
 
@@ -57,10 +31,9 @@ class ExtendedEmbeddigsRobertaModel(RobertaModel):
57
  attention_mask: Optional[torch.Tensor] = None,
58
  token_type_ids: Optional[torch.Tensor] = None,
59
  position_ids: Optional[torch.Tensor] = None,
60
- # gazetteers_ids: Optional[torch.Tensor] = None, # change
61
- per: Optional[torch.Tensor] = None, # change
62
- org: Optional[torch.Tensor] = None, # change
63
- loc: Optional[torch.Tensor] = None, # change
64
  head_mask: Optional[torch.Tensor] = None,
65
  inputs_embeds: Optional[torch.Tensor] = None,
66
  encoder_hidden_states: Optional[torch.Tensor] = None,
 
 
 
1
  from typing import List, Optional, Tuple, Union
2
+
3
  import torch
4
+ from transformers.modeling_outputs import BaseModelOutputWithPoolingAndCrossAttentions
5
+ from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaEncoder, RobertaEmbeddings
6
7
 
8
  class ExtendedEmbeddigsRobertaModel(RobertaModel):
9
  """
10
+ A RobertaModel for token classification tasks with extended embeddings.
11
 
12
+ This RobertaModel extends the functionality of the `RobertaModel` class
13
+ by adding support for additional features such as `per`, `org`, and `loc`.
 
 
14
 
15
+ Part of the code copied from: transformers.models.roberta.modeling_roberta.RobertaModel
 
 
 
 
16
 
17
  """
18
+ def __init__(self, config):
 
 
19
  super().__init__(config)
20
  self.config = config
21
 
22
  self.embeddings = RobertaEmbeddings(config)
23
  self.encoder = RobertaEncoder(config)
24
+ self.pooler = None
 
 
 
25
  # Initialize weights and apply final processing
26
  self.post_init()
27
 
 
31
  attention_mask: Optional[torch.Tensor] = None,
32
  token_type_ids: Optional[torch.Tensor] = None,
33
  position_ids: Optional[torch.Tensor] = None,
34
+ per: Optional[torch.Tensor] = None,
35
+ org: Optional[torch.Tensor] = None,
36
+ loc: Optional[torch.Tensor] = None,
 
37
  head_mask: Optional[torch.Tensor] = None,
38
  inputs_embeds: Optional[torch.Tensor] = None,
39
  encoder_hidden_states: Optional[torch.Tensor] = None,
flagged/log.csv DELETED
@@ -1,8 +0,0 @@
1
- text,output,flag,username,timestamp
2
- Masarykova univerzita se nachází v Brně .,"[{""token"": """", ""class_or_confidence"": null}, {""token"": ""Masarykova univerzita"", ""class_or_confidence"": ""ORG""}, {""token"": "" se nach\u00e1z\u00ed v "", ""class_or_confidence"": null}, {""token"": ""Brn\u011b"", ""class_or_confidence"": ""LOC""}, {""token"": "" ."", ""class_or_confidence"": null}]",,,2024-05-06 02:29:01.157209
3
- Barack Obama navštívil Prahu minulý týden .,"[{""token"": """", ""class_or_confidence"": null}, {""token"": ""Barack Obama"", ""class_or_confidence"": ""OSV""}, {""token"": "" nav\u0161t\u00edvil "", ""class_or_confidence"": null}, {""token"": ""Prahu"", ""class_or_confidence"": ""LOC""}, {""token"": "" minul\u00fd t\u00fdden ."", ""class_or_confidence"": null}]",,,2024-05-06 02:31:57.950478
4
- Masarykova univerzita se nachází v Brně .,"[{""token"": """", ""class_or_confidence"": null}, {""token"": ""Masarykova univerzita"", ""class_or_confidence"": ""ORG""}, {""token"": "" se nach\u00e1z\u00ed v "", ""class_or_confidence"": null}, {""token"": ""Brn\u011b"", ""class_or_confidence"": ""LOC""}, {""token"": "" ."", ""class_or_confidence"": null}]",,,2024-05-06 02:51:30.197653
5
- Barack Obama navštívil Prahu minulý týden .,,,,2024-05-06 10:58:33.085992
6
- Masarykova univerzita se nachází v Brně .,"[{""token"": """", ""class_or_confidence"": null}, {""token"": ""Masarykova univerzita"", ""class_or_confidence"": ""ORG""}, {""token"": "" se nach\u00e1z\u00ed v "", ""class_or_confidence"": null}, {""token"": ""Brn\u011b"", ""class_or_confidence"": ""LOC""}, {""token"": "" ."", ""class_or_confidence"": null}]",,,2024-05-06 11:00:17.762652
7
- Masarykova univerzita se nachází v Brně .,"[{""token"": """", ""class_or_confidence"": null}, {""token"": ""Masarykova univerzita"", ""class_or_confidence"": ""ORG""}, {""token"": "" se nach\u00e1z\u00ed v "", ""class_or_confidence"": null}, {""token"": ""Brn\u011b"", ""class_or_confidence"": ""LOC""}, {""token"": "" ."", ""class_or_confidence"": null}]",,,2024-05-06 11:00:20.057269
8
- ,,,,,2024-05-09 22:59:12.114264
requirements.txt CHANGED
@@ -5,3 +5,4 @@ torch
5
  simplemma
6
  gradio
7
  pandas
 
 
5
  simplemma
6
  gradio
7
  pandas
8
+ names-dataset
style.css CHANGED
@@ -6,10 +6,6 @@ footer {
6
  color-scheme: light dark;
7
  }
8
 
9
- .container .svelte-ju12zg {
10
- color: light-dark(black, white);
11
- }
12
-
13
  .text.svelte-ju12zg {
14
  padding: 0;
15
  margin: 0;
@@ -23,4 +19,9 @@ footer {
23
  .textspan.svelte-ju12zg.no-cat {
24
  margin: 0;
25
  padding: 0;
26
- }
 
 
 
 
 
 
6
  color-scheme: light dark;
7
  }
8
 
 
 
 
 
9
  .text.svelte-ju12zg {
10
  padding: 0;
11
  margin: 0;
 
19
  .textspan.svelte-ju12zg.no-cat {
20
  margin: 0;
21
  padding: 0;
22
+ }
23
+
24
+ .category-label.svelte-ju12zg {
25
+ background-color: light-dark(white, black);
26
+
27
+ }
upload_model.ipynb CHANGED
@@ -2,13 +2,13 @@
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
- "execution_count": 2,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "data": {
10
  "application/vnd.jupyter.widget-view+json": {
11
- "model_id": "65fea98bf7924f4fb4947d8e2dda2f4d",
12
  "version_major": 2,
13
  "version_minor": 0
14
  },
@@ -28,7 +28,7 @@
28
  },
29
  {
30
  "cell_type": "code",
31
- "execution_count": 3,
32
  "metadata": {},
33
  "outputs": [
34
  {
 
2
  "cells": [
3
  {
4
  "cell_type": "code",
5
+ "execution_count": 1,
6
  "metadata": {},
7
  "outputs": [
8
  {
9
  "data": {
10
  "application/vnd.jupyter.widget-view+json": {
11
+ "model_id": "556291d727474e0a82723d6459722b16",
12
  "version_major": 2,
13
  "version_minor": 0
14
  },
 
28
  },
29
  {
30
  "cell_type": "code",
31
+ "execution_count": 2,
32
  "metadata": {},
33
  "outputs": [
34
  {
website_script.py CHANGED
@@ -2,11 +2,39 @@ import json
2
  import copy
3
 
4
  import torch
 
5
  from transformers import AutoTokenizer
6
 
7
- from extended_embeddings.token_classification import ExtendedEmbeddigsRobertaForTokenClassification
8
- from data_manipulation.dataset_funcions import load_gazetteers, gazetteer_matching, align_gazetteers_with_tokens
9
- from data_manipulation.preprocess_gazetteers import build_reverse_dictionary
10
 
11
 
12
  def load():
@@ -18,7 +46,7 @@ def load():
18
  tokenizer = AutoTokenizer.from_pretrained(model_name)
19
  model.eval()
20
 
21
- gazetteers_for_matching = load_gazetteers(gazetteers_path)
22
  temp = []
23
  for i in gazetteers_for_matching.keys():
24
  temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
 
2
  import copy
3
 
4
  import torch
5
+ from simplemma import lemmatize
6
  from transformers import AutoTokenizer
7
 
8
+ from extended_embeddings.extended_embedding_token_classification import ExtendedEmbeddigsRobertaForTokenClassification
9
+ from data_manipulation.dataset_funcions import gazetteer_matching, align_gazetteers_with_tokens
10
+
11
+ # code originally from data_manipulation.creation_gazetteers
12
+ def lemmatizing(x):
13
+ if x == "":
14
+ return ""
15
+ return lemmatize(x, lang="cs")
16
+
17
+ # code originally from data_manipulation.creation_gazetteers
18
+ def build_reverse_dictionary(dictionary, apply_lemmatizing=False):
19
+ reverse_dictionary = {}
20
+ for key, values in dictionary.items():
21
+ for value in values:
22
+ reverse_dictionary[value] = key
23
+ if apply_lemmatizing:
24
+ temp = lemmatizing(value)
25
+ if temp != value:
26
+ reverse_dictionary[temp] = key
27
+ return reverse_dictionary
28
+
29
+ def load_json(path):
30
+ """
31
+ Load gazetteers from a file
32
+ :param path: path to the gazetteer file
33
+ :return: a dict of gazetteers
34
+ """
35
+ with open(path, 'r') as file:
36
+ data = json.load(file)
37
+ return data
38
 
39
 
40
  def load():
 
46
  tokenizer = AutoTokenizer.from_pretrained(model_name)
47
  model.eval()
48
 
49
+ gazetteers_for_matching = load_json(gazetteers_path)
50
  temp = []
51
  for i in gazetteers_for_matching.keys():
52
  temp.append(build_reverse_dictionary({i: gazetteers_for_matching[i]}))
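
Finally, a hedged end-to-end sketch of how app.py drives these helpers; run()'s call signature is taken from app.py above, while its return value (a list of entity spans for gr.HighlightedText) is an assumption:

# Hypothetical command-line use of the refactored website_script module.
from website_script import load, run

tokenizer, model, gazetteers_for_matching = load()
text = "Barack Obama navštívil Prahu minulý týden ."
entities = run(tokenizer, model, gazetteers_for_matching, text, None)  # None = no uploaded gazetteer files
print(entities)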