Spaces:
Runtime error
Runtime error
an extended tokenizing function (as it was proposed in source project)
Browse files
app.py
CHANGED
|
@@ -23,6 +23,49 @@ textbox = gr.Textbox(
|
|
| 23 |
)
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
def load_index(index_data: str = "clarin-knext/entity-linking-index"):
|
| 27 |
ds = datasets.load_dataset(index_data, use_auth_token=auth_token)['train']
|
| 28 |
index_data = {
|
|
@@ -44,7 +87,8 @@ model = load_model()
|
|
| 44 |
index = load_index()
|
| 45 |
|
| 46 |
|
| 47 |
-
def predict(
|
|
|
|
| 48 |
index_data, faiss_index = index
|
| 49 |
# takes only the [CLS] embedding (for now)
|
| 50 |
query = model(query, return_tensors = "pt")[0][0].numpy().reshape(1, -1)
|
|
@@ -52,13 +96,13 @@ def predict(query: str = sample_text, top_k: int=3):
|
|
| 52 |
scores, indices = faiss_index.search(query, top_k)
|
| 53 |
scores, indices = scores.tolist(), indices.tolist()
|
| 54 |
|
| 55 |
-
results = [
|
| 56 |
-
|
| 57 |
for output in zip(indices, scores)
|
| 58 |
for result in zip(*output)
|
| 59 |
-
]
|
| 60 |
|
| 61 |
-
return
|
| 62 |
|
| 63 |
|
| 64 |
demo = gr.Interface(fn=predict, inputs=textbox, outputs="text").launch()
|
|
|
|
| 23 |
)
|
| 24 |
|
| 25 |
|
| 26 |
+
def prepare_query(tokenizer, query, max_seq_length=300):
    """Tokenize a mention-in-context query into a fixed-length id sequence.

    The input ``query`` is plain text in which the mention is wrapped in the
    special markers ``[unused0]`` ... ``[unused1]``.  The output is
    ``[CLS] left-context mention right-context [SEP]`` padded with
    ``pad_token_id`` to exactly ``max_seq_length`` ids.  Left/right context
    each get roughly half of the remaining budget; unused budget on one side
    is donated to the other.

    Args:
        tokenizer: a HuggingFace-style tokenizer (callable returning a dict
            with ``input_ids``; exposes ``cls_token_id`` / ``sep_token_id`` /
            ``pad_token_id``).
        query: text containing exactly one marked mention.
        max_seq_length: total length of the returned id list.

    Returns:
        list[int] of token ids, length exactly ``max_seq_length``.
    """
    # temporary solution
    mention_start_token: str = "[unused0]"
    mention_end_token: str = "[unused1]"

    left_context = query.split(mention_start_token)[0]
    right_context = query.split(mention_end_token)[-1]
    mention = query.split(mention_start_token)[-1].split(mention_end_token)[0]

    mention_ids = tokenizer(
        mention_start_token + mention + mention_end_token,
        add_special_tokens=False
    )['input_ids']
    # Robustness: a mention longer than the window would drive both quotas
    # negative and break the final length invariant — truncate it up front
    # (2 slots are reserved for [CLS]/[SEP]).
    mention_ids = mention_ids[:max_seq_length - 2]

    left_ids = tokenizer(left_context, add_special_tokens=False)['input_ids']
    left_quota = (max_seq_length - len(mention_ids)) // 2 - 1

    right_ids = tokenizer(right_context, add_special_tokens=False)['input_ids']
    right_quota = max_seq_length - len(mention_ids) - left_quota - 2

    left_add, right_add = len(left_ids), len(right_ids)
    if left_add <= left_quota:
        # left side does not use its full budget — donate the surplus right
        right_quota += left_quota - left_add if right_add > right_quota else 0
    else:
        # right side does not use its full budget — donate the surplus left
        left_quota += right_quota - right_add if right_add <= right_quota else 0

    # BUG FIX: the original used left_ids[-left_quota:] directly; with
    # left_quota == 0 that slice is left_ids[-0:] == the WHOLE left context
    # instead of none of it, overflowing max_seq_length and failing the
    # length assertion below.  Guard both slices against non-positive quotas.
    left_part = left_ids[-left_quota:] if left_quota > 0 else []
    right_part = right_ids[:right_quota] if right_quota > 0 else []

    context_ids = [
        tokenizer.cls_token_id,
        *left_part,
        *mention_ids,
        *right_part,
        tokenizer.sep_token_id
    ]

    padding_length = max_seq_length - len(context_ids)
    # attention_mask = [1] * len(context_ids) + [0] * padding_length

    context_ids += [tokenizer.pad_token_id] * padding_length

    assert len(context_ids) == max_seq_length
    return context_ids
|
| 67 |
+
|
| 68 |
+
|
| 69 |
def load_index(index_data: str = "clarin-knext/entity-linking-index"):
|
| 70 |
ds = datasets.load_dataset(index_data, use_auth_token=auth_token)['train']
|
| 71 |
index_data = {
|
|
|
|
| 87 |
index = load_index()
|
| 88 |
|
| 89 |
|
| 90 |
+
def predict(text: str = sample_text, top_k: int=3) -> str:
    """Embed the marked-mention query, search the FAISS index, and return
    the top_k entity matches as "label: score" lines (one per line).

    Relies on module globals: `model`, `index` (a (index_data, faiss_index)
    pair) and `sample_text`.
    """
    # NOTE(review): prepare_query is declared as
    # prepare_query(tokenizer, query, max_seq_length=300) — this call passes
    # only `text`, so it binds to the `tokenizer` parameter and `query` is
    # missing entirely; looks like a TypeError at runtime. Confirm whether a
    # tokenizer global should be threaded through here.
    query = prepare_query(text)
    index_data, faiss_index = index
    # takes only the [CLS] embedding (for now)
    # NOTE(review): assumes `model(...)` accepts this input and returns a
    # tensor whose [0][0] is the CLS embedding — TODO confirm against the
    # model loading code (not visible in this chunk).
    query = model(query, return_tensors = "pt")[0][0].numpy().reshape(1, -1)

    # faiss returns (scores, indices) as 2-D arrays: one row per query vector
    scores, indices = faiss_index.search(query, top_k)
    scores, indices = scores.tolist(), indices.tolist()

    # pair each hit's index with its score, map the index through index_data,
    # and render one "label: score" line per hit
    results = "\n".join([
        f"{index_data[result[0]]}: {result[1]}"
        for output in zip(indices, scores)
        for result in zip(*output)
    ])

    return results
|
| 106 |
|
| 107 |
|
| 108 |
demo = gr.Interface(fn=predict, inputs=textbox, outputs="text").launch()
|