# NOTE: "Spaces: Sleeping" page-status text from the Hugging Face Spaces
# listing was captured when this file was scraped; it is not part of the app.
# Streamlit app to highlight NER entities
import random

import streamlit as st
from datasets import load_dataset
from annotated_text import annotated_text
# Show highlighted ner entities in a tweet | |
def display_ner(example): | |
ner_output = example["ner_output"] | |
chunks = [] | |
current_chunk = "" | |
current_type = None | |
# Check if there are two labels repeated | |
previous_label = None | |
for label in ner_output["labels"]: | |
if ( | |
label | |
and previous_label | |
and previous_label == label | |
and label != "O" | |
and not label.startswith("I-") | |
and not label.startswith("B-") | |
): | |
pass | |
previous_label = label | |
for token, label in zip(ner_output["tokens"], ner_output["labels"]): | |
if label is None: | |
# Perhaps it is too long | |
continue | |
if label == "O": | |
if current_type is not None: | |
# Add previous entity | |
chunks.append((current_chunk.strip(), current_type)) | |
current_chunk = token + " " | |
current_type = None | |
else: | |
current_chunk += token + " " | |
current_type = None | |
elif label.startswith("B-"): | |
if current_chunk: | |
chunks.append((current_chunk.strip(), current_type)) | |
current_chunk = token + " " | |
current_type = label[2:] | |
elif label.startswith("I-"): | |
current_chunk += token + " " | |
current_type = label[2:] | |
else: | |
# It doesn't start with B- or I- => add single token | |
if label != current_type: | |
chunks.append((current_chunk.strip(), current_type)) | |
current_chunk = token + " " | |
current_type = label | |
else: | |
current_chunk += token + " " | |
current_type = label | |
if current_chunk: | |
chunks.append((current_chunk.strip(), current_type)) | |
# Display text | |
chunks = [(c, t) if t is not None else c for c, t in chunks] | |
annotated_text(*chunks) | |
def display_text(example, text_column): | |
# Use annotated_text to show entities | |
text = example[text_column] | |
# Sort entities by start | |
entities = sorted(example["entities"], key=lambda x: x["start"]) | |
for entity in entities: | |
entity_text = entity["text"] | |
# find in text | |
start = text.find(entity_text) | |
end = start + len(entity_text) | |
entity["start"] = start | |
entity["end"] = end | |
# Chunk text | |
if len(entities) == 0: | |
annotated_text(*[text]) | |
return | |
chunks = [] | |
last_index = 0 | |
for i in range(len(entities)): | |
entity = entities[i] | |
start, end = entity["start"], entity["end"] | |
if last_index < start: | |
chunk_before_entity = text[last_index : entity["start"]] | |
chunks.append((chunk_before_entity, None)) | |
chunks.append((entity["text"], entity["type"])) | |
last_index = end | |
if last_index < len(text): | |
chunks.append((text[last_index:], None)) | |
# description = entity["kg_result"]["detailedDescription"]["articleBody"] | |
chunks = [(c, t) if t is not None else c for c, t in chunks] | |
annotated_text(*chunks) | |
# selectbox to choose dataset | |
selected_dataset = st.sidebar.selectbox( | |
"Select dataset", ["hateval_enriched", "sbf-enriched", "hatecheck-enriched"] | |
) | |
# Load data | |
ds = load_dataset(f"hs-knowledge/{selected_dataset}") | |
text_column = { | |
"hateval_enriched": "text", | |
"sbf-enriched": "post", | |
"hatecheck-enriched": "test_case", | |
} | |
elements = random.choices(range(len(ds["train"])), k=50) | |
ds["train"] = ds["train"].select(elements) | |
for ex in ds["train"]: | |
# display_text(ex) | |
st.markdown("---") | |
display_ner(ex) | |
with st.expander("Show entities"): | |
for ent in ex["entities"]: | |
entity_name = ent["text"] | |
entity_type = ent["type"] | |
entity_description = ent["kg_result"]["detailedDescription"]["articleBody"] | |
st.write(f"{entity_name} ({entity_type}): {entity_description}") | |