Spaces:
Runtime error
Runtime error
an extended tokenizing function (as it was proposed in source project)
Browse files
app.py
CHANGED
|
@@ -23,6 +23,49 @@ textbox = gr.Textbox(
|
|
| 23 |
)
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
def load_index(index_data: str = "clarin-knext/entity-linking-index"):
|
| 27 |
ds = datasets.load_dataset(index_data, use_auth_token=auth_token)['train']
|
| 28 |
index_data = {
|
|
@@ -44,7 +87,8 @@ model = load_model()
|
|
| 44 |
index = load_index()
|
| 45 |
|
| 46 |
|
| 47 |
-
def predict(
|
|
|
|
| 48 |
index_data, faiss_index = index
|
| 49 |
# takes only the [CLS] embedding (for now)
|
| 50 |
query = model(query, return_tensors = "pt")[0][0].numpy().reshape(1, -1)
|
|
@@ -52,13 +96,13 @@ def predict(query: str = sample_text, top_k: int=3):
|
|
| 52 |
scores, indices = faiss_index.search(query, top_k)
|
| 53 |
scores, indices = scores.tolist(), indices.tolist()
|
| 54 |
|
| 55 |
-
results = [
|
| 56 |
-
|
| 57 |
for output in zip(indices, scores)
|
| 58 |
for result in zip(*output)
|
| 59 |
-
]
|
| 60 |
|
| 61 |
-
return
|
| 62 |
|
| 63 |
|
| 64 |
demo = gr.Interface(fn=predict, inputs=textbox, outputs="text").launch()
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
|
| 26 |
+
def prepare_query(tokenizer, query, max_seq_length=300):
    """Tokenize a mention-in-context query into a fixed-length id sequence.

    The input ``query`` is plain text in which the mention is wrapped in the
    special markers ``[unused0]`` ... ``[unused1]``.  The output is
    ``[CLS] left-context mention right-context [SEP]`` padded with
    ``pad_token_id`` to exactly ``max_seq_length`` ids.  Left/right context
    each get roughly half of the remaining budget; unused budget on one side
    is donated to the other.

    Args:
        tokenizer: a HuggingFace-style tokenizer (callable returning a dict
            with ``input_ids``; exposes ``cls_token_id`` / ``sep_token_id`` /
            ``pad_token_id``).
        query: text containing exactly one marked mention.
        max_seq_length: total length of the returned id list.

    Returns:
        list[int] of token ids, length exactly ``max_seq_length``.
    """
    # temporary solution
    mention_start_token: str = "[unused0]"
    mention_end_token: str = "[unused1]"

    left_context = query.split(mention_start_token)[0]
    right_context = query.split(mention_end_token)[-1]
    mention = query.split(mention_start_token)[-1].split(mention_end_token)[0]

    mention_ids = tokenizer(
        mention_start_token + mention + mention_end_token,
        add_special_tokens=False
    )['input_ids']
    # Robustness: a mention longer than the window would drive both quotas
    # negative and break the final length invariant — truncate it up front
    # (2 slots are reserved for [CLS]/[SEP]).
    mention_ids = mention_ids[:max_seq_length - 2]

    left_ids = tokenizer(left_context, add_special_tokens=False)['input_ids']
    left_quota = (max_seq_length - len(mention_ids)) // 2 - 1

    right_ids = tokenizer(right_context, add_special_tokens=False)['input_ids']
    right_quota = max_seq_length - len(mention_ids) - left_quota - 2

    left_add, right_add = len(left_ids), len(right_ids)
    if left_add <= left_quota:
        # left side does not use its full budget — donate the surplus right
        right_quota += left_quota - left_add if right_add > right_quota else 0
    else:
        # right side does not use its full budget — donate the surplus left
        left_quota += right_quota - right_add if right_add <= right_quota else 0

    # BUG FIX: the original used left_ids[-left_quota:] directly; with
    # left_quota == 0 that slice is left_ids[-0:] == the WHOLE left context
    # instead of none of it, overflowing max_seq_length and failing the
    # length assertion below.  Guard both slices against non-positive quotas.
    left_part = left_ids[-left_quota:] if left_quota > 0 else []
    right_part = right_ids[:right_quota] if right_quota > 0 else []

    context_ids = [
        tokenizer.cls_token_id,
        *left_part,
        *mention_ids,
        *right_part,
        tokenizer.sep_token_id
    ]

    padding_length = max_seq_length - len(context_ids)
    # attention_mask = [1] * len(context_ids) + [0] * padding_length

    context_ids += [tokenizer.pad_token_id] * padding_length

    assert len(context_ids) == max_seq_length
    return context_ids
|
| 67 |
+
|
| 68 |
+
|
| 69 |
def load_index(index_data: str = "clarin-knext/entity-linking-index"):
|
| 70 |
ds = datasets.load_dataset(index_data, use_auth_token=auth_token)['train']
|
| 71 |
index_data = {
|
|
|
|
| 87 |
index = load_index()
|
| 88 |
|
| 89 |
|
| 90 |
+
def predict(text: str = sample_text, top_k: int=3) -> str:
    """Embed the marked-mention query, search the FAISS index, and return
    the top_k entity matches as "label: score" lines (one per line).

    Relies on module globals: `model`, `index` (a (index_data, faiss_index)
    pair) and `sample_text`.
    """
    # NOTE(review): prepare_query is declared as
    # prepare_query(tokenizer, query, max_seq_length=300) — this call passes
    # only `text`, so it binds to the `tokenizer` parameter and `query` is
    # missing entirely; looks like a TypeError at runtime. Confirm whether a
    # tokenizer global should be threaded through here.
    query = prepare_query(text)
    index_data, faiss_index = index
    # takes only the [CLS] embedding (for now)
    # NOTE(review): assumes `model(...)` accepts this input and returns a
    # tensor whose [0][0] is the CLS embedding — TODO confirm against the
    # model loading code (not visible in this chunk).
    query = model(query, return_tensors = "pt")[0][0].numpy().reshape(1, -1)

    # faiss returns (scores, indices) as 2-D arrays: one row per query vector
    scores, indices = faiss_index.search(query, top_k)
    scores, indices = scores.tolist(), indices.tolist()

    # pair each hit's index with its score, map the index through index_data,
    # and render one "label: score" line per hit
    results = "\n".join([
        f"{index_data[result[0]]}: {result[1]}"
        for output in zip(indices, scores)
        for result in zip(*output)
    ])

    return results
|
| 106 |
|
| 107 |
|
| 108 |
demo = gr.Interface(fn=predict, inputs=textbox, outputs="text").launch()
|