ceyda aseifert committed on
Commit
2d4811a
·
0 Parent(s):

Duplicate from aseifert/ExplaiNER


Co-authored-by: Alexander Seifert <[email protected]>

.gitattributes ADDED
@@ -0,0 +1,27 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ftz filter=lfs diff=lfs merge=lfs -text
6
+ *.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.h5 filter=lfs diff=lfs merge=lfs -text
8
+ *.joblib filter=lfs diff=lfs merge=lfs -text
9
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
10
+ *.model filter=lfs diff=lfs merge=lfs -text
11
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
12
+ *.onnx filter=lfs diff=lfs merge=lfs -text
13
+ *.ot filter=lfs diff=lfs merge=lfs -text
14
+ *.parquet filter=lfs diff=lfs merge=lfs -text
15
+ *.pb filter=lfs diff=lfs merge=lfs -text
16
+ *.pt filter=lfs diff=lfs merge=lfs -text
17
+ *.pth filter=lfs diff=lfs merge=lfs -text
18
+ *.rar filter=lfs diff=lfs merge=lfs -text
19
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
20
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
21
+ *.tflite filter=lfs diff=lfs merge=lfs -text
22
+ *.tgz filter=lfs diff=lfs merge=lfs -text
23
+ *.wasm filter=lfs diff=lfs merge=lfs -text
24
+ *.xz filter=lfs diff=lfs merge=lfs -text
25
+ *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,169 @@
1
+
2
+ # Created by https://www.gitignore.io/api/python,osx,linux
3
+ # Edit at https://www.gitignore.io/?templates=python,osx,linux
4
+
5
+ ### Linux ###
6
+ *~
7
+
8
+ # temporary files which can be created if a process still has a handle open of a deleted file
9
+ .fuse_hidden*
10
+
11
+ # KDE directory preferences
12
+ .directory
13
+
14
+ # Linux trash folder which might appear on any partition or disk
15
+ .Trash-*
16
+
17
+ # .nfs files are created when an open file is removed but is still being accessed
18
+ .nfs*
19
+
20
+ ### OSX ###
21
+ # General
22
+ .DS_Store
23
+ .AppleDouble
24
+ .LSOverride
25
+
26
+ # Icon must end with two \r
27
+ Icon
28
+
29
+ # Thumbnails
30
+ ._*
31
+
32
+ # Files that might appear in the root of a volume
33
+ .DocumentRevisions-V100
34
+ .fseventsd
35
+ .Spotlight-V100
36
+ .TemporaryItems
37
+ .Trashes
38
+ .VolumeIcon.icns
39
+ .com.apple.timemachine.donotpresent
40
+
41
+ # Directories potentially created on remote AFP share
42
+ .AppleDB
43
+ .AppleDesktop
44
+ Network Trash Folder
45
+ Temporary Items
46
+ .apdisk
47
+
48
+ ### Python ###
49
+ # Byte-compiled / optimized / DLL files
50
+ __pycache__/
51
+ *.py[cod]
52
+ *$py.class
53
+
54
+ # C extensions
55
+ *.so
56
+
57
+ # Distribution / packaging
58
+ .Python
59
+ build/
60
+ develop-eggs/
61
+ dist/
62
+ downloads/
63
+ eggs/
64
+ .eggs/
65
+ lib/
66
+ lib64/
67
+ parts/
68
+ sdist/
69
+ var/
70
+ wheels/
71
+ pip-wheel-metadata/
72
+ share/python-wheels/
73
+ *.egg-info/
74
+ .installed.cfg
75
+ *.egg
76
+ MANIFEST
77
+
78
+ # PyInstaller
79
+ # Usually these files are written by a python script from a template
80
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
81
+ *.manifest
82
+ *.spec
83
+
84
+ # Installer logs
85
+ pip-log.txt
86
+ pip-delete-this-directory.txt
87
+
88
+ # Unit test / coverage reports
89
+ htmlcov/
90
+ .tox/
91
+ .nox/
92
+ .coverage
93
+ .coverage.*
94
+ .cache
95
+ nosetests.xml
96
+ coverage.xml
97
+ *.cover
98
+ .hypothesis/
99
+ .pytest_cache/
100
+
101
+ # Translations
102
+ *.mo
103
+ *.pot
104
+
105
+ # Scrapy stuff:
106
+ .scrapy
107
+
108
+ # Sphinx documentation
109
+ docs/_build/
110
+
111
+ # PyBuilder
112
+ target/
113
+
114
+ # pyenv
115
+ .python-version
116
+
117
+ # pipenv
118
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
119
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
120
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
121
+ # install all needed dependencies.
122
+ #Pipfile.lock
123
+
124
+ # celery beat schedule file
125
+ celerybeat-schedule
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Spyder project settings
131
+ .spyderproject
132
+ .spyproject
133
+
134
+ # Rope project settings
135
+ .ropeproject
136
+
137
+ # Mr Developer
138
+ .mr.developer.cfg
139
+ .project
140
+ .pydevproject
141
+
142
+ # mkdocs documentation
143
+ /site
144
+
145
+ # mypy
146
+ .mypy_cache/
147
+ .dmypy.json
148
+ dmypy.json
149
+
150
+ # Pyre type checker
151
+ .pyre/
152
+
153
+ # End of https://www.gitignore.io/api/python,osx,linux
154
+
155
+ .idea/
156
+ .ipynb_checkpoints/
157
+ node_modules/
158
+ data/books/
159
+ docx/cache/
160
+ docx/data/
161
+ save_dir/
162
+ cache_dir/
163
+ outputs/
164
+ models/
165
+ runs/
166
+ .vscode/
167
+ doc/
168
+ html/*.html
169
+ vis2.zip
Makefile ADDED
@@ -0,0 +1,10 @@
1
+ doc:
2
+ pdoc --docformat google src -o doc
3
+
4
+ vis2: doc
5
+ pandoc html/index.md -s -o html/index.html
6
+ rm -rf src/__pycache__ && rm -rf src/subpages/__pycache__
7
+ zip -r vis2.zip doc html src Makefile presentation.pdf requirements.txt
8
+
9
+ run:
10
+ python -m streamlit run src/app.py
README.md ADDED
@@ -0,0 +1,78 @@
1
+ ---
2
+ title: ExplaiNER
3
+ emoji: 🏷️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ python_version: 3.9
7
+ sdk: streamlit
8
+ sdk_version: 1.10.0
9
+ app_file: src/app.py
10
+ pinned: true
11
+ duplicated_from: aseifert/ExplaiNER
12
+ ---
13
+
14
+ # 🏷️ ExplaiNER: Error Analysis for NER models & datasets
15
+
16
+ Error Analysis is an important but often overlooked part of the data science project lifecycle, for which there is still very little tooling available. Practitioners tend to write throwaway code or, worse, skip this crucial step of understanding their models' errors altogether. This project tries to provide an extensive toolkit to probe any NER model/dataset combination, find labeling errors and understand the models' and datasets' limitations, leading the user on her way to further improvements.
17
+
18
+ ## Sections
19
+
20
+
21
+ ### Activations
22
+
23
+ A group of neurons tends to fire in response to commas and other punctuation. Other groups of neurons tend to fire in response to pronouns. Use this visualization to factorize neuron activity in individual FFNN layers or in the entire model.
24
+
25
+
26
+ ### Embeddings
27
+
28
+ For every token in the dataset, we take its hidden state and project it onto a two-dimensional plane. Data points are colored by label/prediction, with disagreements marked by a small black border.
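A minimal sketch of the projection idea (not the app's exact code): take each token's final-layer hidden state from a token-classification model and map it to 2-D with PCA. The model name below is the app's default checkpoint; the example sentence is just a placeholder.

```python
import torch
from sklearn.decomposition import PCA
from transformers import AutoModelForTokenClassification, AutoTokenizer

name = "elastic/distilbert-base-uncased-finetuned-conll03-english"  # the app's default model
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForTokenClassification.from_pretrained(name)

enc = tokenizer("Angela Merkel visited Paris .", return_tensors="pt")
with torch.no_grad():
    out = model(**enc, output_hidden_states=True)

hidden = out.hidden_states[-1][0]                           # (seq_len, hidden_dim), final layer
points = PCA(n_components=2).fit_transform(hidden.numpy())  # project each token to 2-D
preds = out.logits.argmax(-1)[0]

tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0].tolist())
for tok, (x, y), p in zip(tokens, points, preds):
    print(f"{tok:12} ({x:+.2f}, {y:+.2f})  {model.config.id2label[int(p)]}")
```

The app pools hidden states over many sentences and also offers truncated SVD and UMAP as alternative projections.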
29
+
30
+
31
+ ### Probing
32
+
33
+ A very direct and interactive way to test your model is by providing it with a list of text inputs and then inspecting the model outputs. The application features a multiline text field so the user can input multiple texts separated by newlines. For each text, the app will show a data frame containing the tokenized string, token predictions, probabilities and a visual indicator for low probability predictions -- these are the ones you should inspect first for prediction errors.
34
+
35
+
36
+ ### Metrics
37
+
38
+ The metrics page contains precision, recall and f-score metrics as well as a confusion matrix over all the classes. By default, the confusion matrix is normalized. There's an option to zero out the diagonal, leaving only prediction errors (here it makes sense to turn off normalization, so you get raw error counts).
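A rough, token-level sketch of the "zero out the diagonal" trick with scikit-learn; the label and prediction lists below are stand-ins for the flattened token columns the app computes:

```python
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

labels = ["O", "B-PER", "I-PER", "O", "B-LOC", "O"]
preds  = ["O", "B-PER", "O",     "O", "B-ORG", "O"]
classes = sorted(set(labels) | set(preds))

# precision / recall / f-score per class
print(classification_report(labels, preds, zero_division=0))

cm = confusion_matrix(labels, preds, labels=classes)  # raw counts
np.fill_diagonal(cm, 0)                               # zero the diagonal: only errors remain
for name, row in zip(classes, cm):
    print(f"{name:8}", row)
```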
39
+
40
+
41
+ ### Misclassified
42
+
43
+ This page contains all misclassified examples and allows filtering by specific error types.
44
+
45
+
46
+ ### Loss by Token/Label
47
+
48
+ Show count, mean and median loss per token and label.
49
+
50
+
51
+ ### Samples by Loss
52
+
53
+ Show every example sorted by loss (descending) for close inspection.
54
+
55
+
56
+ ### Random Samples
57
+
58
+ Show random samples. Simple method, but it often turns up interesting things.
59
+
60
+
61
+ ### Find Duplicates
62
+
63
+ Find potential duplicates in the data using cosine similarity.
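The underlying idea, sketched with `sentence-transformers` (the encoder name is the app's default; the texts are placeholders taken from CoNLL-2003):

```python
from sentence_transformers import SentenceTransformer, util

encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
texts = [
    "EU rejects German call to boycott British lamb .",
    "EU rejects German call to boycott British lamb.",
    "Peter Blackburn",
]
emb = encoder.encode(texts, convert_to_tensor=True)
sims = util.cos_sim(emb, emb)  # pairwise cosine similarity matrix

cutoff = 0.95
for i in range(len(texts)):
    for j in range(i + 1, len(texts)):
        if float(sims[i][j]) >= cutoff:
            print(f"possible duplicate ({float(sims[i][j]):.3f}): {texts[i]!r} / {texts[j]!r}")
```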
64
+
65
+
66
+ ### Inspect
67
+
68
+ Inspect your whole dataset, either unfiltered or by id.
69
+
70
+
71
+ ### Raw data
72
+
73
+ See the data as seen by your model.
74
+
75
+
76
+ ### Debug
77
+
78
+ Debug info.
html/index.md ADDED
@@ -0,0 +1,116 @@
1
+ ---
2
+ title: "🏷️ ExplaiNER"
3
+ subtitle: "Error Analysis for NER models & datasets"
4
+ ---
5
+
6
+ <div style="text-align: center">
7
+ <img src="screenshot.jpg" alt="drawing" width="480px"/>
8
+ </div>
9
+
10
+ _Error Analysis is an important but often overlooked part of the data science project lifecycle, for which there is still very little tooling available. Practitioners tend to write throwaway code or, worse, skip this crucial step of understanding their models' errors altogether. This project tries to provide an extensive toolkit to probe any NER model/dataset combination, find labeling errors and understand the models' and datasets' limitations, leading the user on her way to further improvements._
11
+
12
+ [Documentation](../doc/index.html) | [Slides](../presentation.pdf) | [Github](https://github.com/aseifert/ExplaiNER)
13
+
14
+
15
+ ## Getting started
16
+
17
+ ```bash
18
+ # Install requirements
19
+ pip install -r requirements.txt # you'll need Python 3.9+
20
+
21
+ # Run
22
+ make run
23
+ ```
24
+
25
+ ## Description
26
+
27
+ Some interesting **visualization techniques** contained in this project:
28
+
29
+ * customizable visualization of neural network activation, based on the embedding layer and the feed-forward layers of the selected transformer model. ([Alammar 2021](https://aclanthology.org/2021.acl-demo.30/))
30
+ * customizable similarity map of a 2d projection of the model's final layer's hidden states, using various algorithms (a bit like the [Tensorflow Embedding Projector](https://projector.tensorflow.org/))
31
+ * inline HTML representation of samples with token-level prediction + labels (my own; see below under 'Samples by loss' for more info)
32
+
33
+
34
+ **Libraries** important to this project:
35
+
36
+ * `streamlit` for demoing (custom multi-page feature hacked in, also using session state)
37
+ * `plotly` and `matplotlib` for charting
38
+ * `transformers` for providing the models, and `datasets` for, well, the datasets
39
+ * a forked, slightly modified version of [`ecco`](https://github.com/jalammar/ecco) for visualizing the neural net activations
40
+ * `sentence_transformers` for finding potential duplicates
41
+ * `scikit-learn` for TruncatedSVD & PCA, `umap-learn` for UMAP
42
+
43
+
44
+ ## Application Sections
45
+
46
+
47
+ Activations
48
+
49
+ > A group of neurons tends to fire in response to commas and other punctuation. Other groups of neurons tend to fire in response to pronouns. Use this visualization to factorize neuron activity in individual FFNN layers or in the entire model.
50
+
51
+
52
+ Hidden States
53
+
54
+ > For every token in the dataset, we take its hidden state and project it onto a two-dimensional plane. Data points are colored by label/prediction, with disagreements marked by a small black border.
55
+ >
56
+ > Using these projections you can visually identify data points that end up in the wrong neighborhood, indicating prediction/labeling errors.
57
+
58
+
59
+ Probing
60
+
61
+ > A very direct and interactive way to test your model is by providing it with a list of text inputs and then inspecting the model outputs. The application features a multiline text field so the user can input multiple texts separated by newlines. For each text, the app will show a data frame containing the tokenized string, token predictions, probabilities and a visual indicator for low probability predictions -- these are the ones you should inspect first for prediction errors.
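A hedged sketch of the same probing loop outside the app, using the `transformers` pipeline API with the default CoNLL-03 checkpoint; the 0.9 cutoff for flagging low-confidence tokens is an arbitrary choice:

```python
from transformers import pipeline

tagger = pipeline(
    "token-classification",
    model="elastic/distilbert-base-uncased-finetuned-conll03-english",
)

texts = "Angela Merkel visited Paris.\nApple opened a new store in Berlin.".split("\n")
for text in texts:
    for tok in tagger(text):
        flag = "⚠️" if tok["score"] < 0.9 else ""  # low-confidence predictions are worth inspecting first
        print(f"{tok['word']:12} {tok['entity']:8} {tok['score']:.3f} {flag}")
```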
62
+
63
+
64
+ Metrics
65
+
66
+ > The metrics page contains precision, recall and f-score metrics as well as a confusion matrix over all the classes. By default, the confusion matrix is normalized. There's an option to zero out the diagonal, leaving only prediction errors (here it makes sense to turn off normalization, so you get raw error counts).
67
+ >
68
+ > With the confusion matrix, you don't want any of the classes to end up in the bottom right quarter: those are frequent but error-prone.
69
+
70
+
71
+ Misclassified
72
+
73
+ > This page contains all misclassified examples and allows filtering by specific error types. Helps you get an understanding of the types of errors your model makes.
74
+
75
+
76
+ Loss by Token/Label
77
+
78
+ > Show count, mean and median loss per token and label.
79
+ >
80
+ > Look out for tokens that have a big gap between mean and median, indicating systematic labeling issues.
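The aggregation itself is a plain pandas groupby. A toy sketch, assuming a token-level dataframe with `tokens`, `labels` and `losses` columns like the one the app builds (the values are made up):

```python
import pandas as pd

df_tokens = pd.DataFrame(
    {
        "tokens": ["Angela", "Merkel", "visited", "Paris", ".", "Paris"],
        "labels": ["B-PER", "I-PER", "O", "B-LOC", "O", "O"],
        "losses": [0.02, 0.05, 0.01, 0.40, 0.01, 2.30],
    }
)

# count, mean and median loss per token and per label
by_token = df_tokens.groupby("tokens")["losses"].agg(["count", "mean", "median"])
by_label = df_tokens.groupby("labels")["losses"].agg(["count", "mean", "median"])
print(by_token.sort_values("mean", ascending=False))  # "Paris" shows a large mean/median gap
print(by_label.sort_values("mean", ascending=False))
```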
81
+
82
+
83
+ Samples by Loss
84
+
85
+ > Show every example sorted by loss (descending) for close inspection.
86
+ >
87
+ > Apart from a (token-based) dataframe view, there's also an HTML representation of the samples, which is very information-dense but really helpful once you get used to reading it:
88
+ >
89
+ > Every predicted entity (every token, really) gets a black border. The text color signifies the predicted label, with the first token of a sequence of tokens also showing the label's icon. If (and only if) the prediction is wrong, a small box after the entity (token) contains the correct target class, with a background color corresponding to that class.
90
+ >
91
+ > For short texts, the dataframe view can be sufficient, but for longer texts the HTML view tends to be more useful.
92
+
93
+
94
+ Random Samples
95
+
96
+ > Show random samples. Simple method, but it often turns up interesting things.
97
+
98
+
99
+ Find Duplicates
100
+
101
+ > Find potential duplicates in the data using cosine similarity.
102
+
103
+
104
+ Inspect
105
+
106
+ > Inspect your whole dataset, either unfiltered or by id.
107
+
108
+
109
+ Raw data
110
+
111
+ > See the data as seen by your model.
112
+
113
+
114
+ Debug
115
+
116
+ > Debug info.
html/screenshot.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,16 @@
1
+ https://download.pytorch.org/whl/cpu/torch-1.11.0%2Bcpu-cp39-cp39-linux_x86_64.whl
2
+ streamlit
3
+ pandas
4
+ scikit-learn
5
+ plotly
6
+ sentence-transformers
7
+ transformers
8
+ tokenizers
9
+ datasets
10
+ numpy
11
+ matplotlib
12
+ seqeval
13
+ streamlit-aggrid
14
+ streamlit_option_menu
15
+ pdoc
16
+ git+https://github.com/aseifert/ecco.git@streamlit
src/__init__.py ADDED
File without changes
src/app.py ADDED
@@ -0,0 +1,114 @@
1
+ """The App module is the main entry point for the application.
2
+
3
+ Run `streamlit run app.py` to start the app.
4
+ """
5
+
6
+ import pandas as pd
7
+ import streamlit as st
8
+ from streamlit_option_menu import option_menu
9
+
10
+ from src.load import load_context
11
+ from src.subpages import (
12
+ DebugPage,
13
+ FindDuplicatesPage,
14
+ HomePage,
15
+ LossesPage,
16
+ LossySamplesPage,
17
+ MetricsPage,
18
+ MisclassifiedPage,
19
+ Page,
20
+ ProbingPage,
21
+ RandomSamplesPage,
22
+ RawDataPage,
23
+ )
24
+ from src.subpages.attention import AttentionPage
25
+ from src.subpages.hidden_states import HiddenStatesPage
26
+ from src.subpages.inspect import InspectPage
27
+ from src.utils import classmap
28
+
29
+ sts = st.sidebar
30
+ st.set_page_config(
31
+ layout="wide",
32
+ page_title="Error Analysis",
33
+ page_icon="🏷️",
34
+ )
35
+
36
+
37
+ def _show_menu(pages: list[Page]) -> int:
38
+ with st.sidebar:
39
+ page_names = [p.name for p in pages]
40
+ page_icons = [p.icon for p in pages]
41
+ selected_menu_item = st.session_state.active_page = option_menu(
42
+ menu_title="ExplaiNER",
43
+ options=page_names,
44
+ icons=page_icons,
45
+ menu_icon="layout-wtf",
46
+ default_index=0,
47
+ )
48
+ return page_names.index(selected_menu_item)
49
+ assert False
50
+
51
+
52
+ def _initialize_session_state(pages: list[Page]):
53
+ if "active_page" not in st.session_state:
54
+ for page in pages:
55
+ st.session_state.update(**page._get_widget_defaults())
56
+ st.session_state.update(st.session_state)
57
+
58
+
59
+ def _write_color_legend(context):
60
+ def style(x):
61
+ return [f"background-color: {rgb}; opacity: 1;" for rgb in colors]
62
+
63
+ labels = list(set([lbl.split("-")[1] if "-" in lbl else lbl for lbl in context.labels]))
64
+ colors = [st.session_state.get(f"color_{lbl}", "#000000") for lbl in labels]
65
+
66
+ color_legend_df = pd.DataFrame(
67
+ [classmap[l] for l in labels], columns=["label"], index=labels
68
+ ).T
69
+ st.sidebar.write(
70
+ color_legend_df.T.style.apply(style, axis=0).set_properties(
71
+ **{"color": "white", "text-align": "center"}
72
+ )
73
+ )
74
+
75
+
76
+ def main():
77
+ """The main entry point for the application."""
78
+ pages: list[Page] = [
79
+ HomePage(),
80
+ AttentionPage(),
81
+ HiddenStatesPage(),
82
+ ProbingPage(),
83
+ MetricsPage(),
84
+ LossySamplesPage(),
85
+ LossesPage(),
86
+ MisclassifiedPage(),
87
+ RandomSamplesPage(),
88
+ FindDuplicatesPage(),
89
+ InspectPage(),
90
+ RawDataPage(),
91
+ DebugPage(),
92
+ ]
93
+
94
+ _initialize_session_state(pages)
95
+
96
+ selected_page_idx = _show_menu(pages)
97
+ selected_page = pages[selected_page_idx]
98
+
99
+ if isinstance(selected_page, HomePage):
100
+ selected_page.render()
101
+ return
102
+
103
+ if "model_name" not in st.session_state:
104
+ # this can happen if someone loads another page directly (without going through home)
105
+ st.error("Setup not complete. Please click on 'Home / Setup' in the left menu bar.")
106
+ return
107
+
108
+ context = load_context(**st.session_state)
109
+ _write_color_legend(context)
110
+ selected_page.render(context)
111
+
112
+
113
+ if __name__ == "__main__":
114
+ main()
src/data.py ADDED
@@ -0,0 +1,228 @@
1
+ from functools import partial
2
+
3
+ import pandas as pd
4
+ import streamlit as st
5
+ import torch
6
+ from datasets import Dataset, DatasetDict, load_dataset # type: ignore
7
+ from torch.nn.functional import cross_entropy
8
+ from transformers import DataCollatorForTokenClassification # type: ignore
9
+
10
+ from src.utils import device, tokenizer_hash_funcs
11
+
12
+
13
+ @st.cache(allow_output_mutation=True)
14
+ def get_data(
15
+ ds_name: str, config_name: str, split_name: str, split_sample_size: int, randomize_sample: bool
16
+ ) -> Dataset:
17
+ """Loads a Dataset from the HuggingFace hub (if not already loaded).
18
+
19
+ Uses `datasets.load_dataset` to load the dataset (see its documentation for additional details).
20
+
21
+ Args:
22
+ ds_name (str): Path or name of the dataset.
23
+ config_name (str): Name of the dataset configuration.
24
+ split_name (str): Which split of the data to load.
25
+ split_sample_size (int): The number of examples to load from the split.
+ randomize_sample (bool): Whether to shuffle the split before taking the sample.
26
+
27
+ Returns:
28
+ Dataset: A Dataset object.
29
+ """
30
+ ds: DatasetDict = load_dataset(ds_name, name=config_name, use_auth_token=True) # type: ignore
+ if randomize_sample:
+ # fixed seed keeps the "random" sample reproducible across reruns
+ ds = ds.shuffle(seed=0)
+ split = ds[split_name].select(range(split_sample_size))
34
+ return split
35
+
36
+
37
+ @st.cache(
38
+ allow_output_mutation=True,
39
+ hash_funcs=tokenizer_hash_funcs,
40
+ )
41
+ def get_collator(tokenizer) -> DataCollatorForTokenClassification:
42
+ """Returns a DataCollator that will dynamically pad the inputs received, as well as the labels.
43
+
44
+ Args:
45
+ tokenizer ([PreTrainedTokenizer] or [PreTrainedTokenizerFast]): The tokenizer used for encoding the data.
46
+
47
+ Returns:
48
+ DataCollatorForTokenClassification: The DataCollatorForTokenClassification object.
49
+ """
50
+ return DataCollatorForTokenClassification(tokenizer)
51
+
52
+
53
+ def create_word_ids_from_input_ids(tokenizer, input_ids: list[int]) -> list[int]:
54
+ """Takes a list of input_ids and return corresponding word_ids
55
+
56
+ Args:
57
+ tokenizer: The tokenizer that was used to obtain the input ids.
58
+ input_ids (list[int]): List of token ids.
59
+
60
+ Returns:
61
+ list[int]: Word ids corresponding to the input ids.
62
+ """
63
+ word_ids = []
64
+ wid = -1
65
+ tokens = [tokenizer.convert_ids_to_tokens(i) for i in input_ids]
66
+
67
+ for i, tok in enumerate(tokens):
68
+ if tok in tokenizer.all_special_tokens:
69
+ word_ids.append(-1)
70
+ continue
71
+
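+ # Heuristic (assumes "@@"-style BPE continuation markers): a new word starts whenever the previous token is not a continuation piece.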
72
+ if not tokens[i - 1].endswith("@@") and tokens[i - 1] != "<unk>":
73
+ wid += 1
74
+
75
+ word_ids.append(wid)
76
+
77
+ assert len(word_ids) == len(input_ids)
78
+ return word_ids
79
+
80
+
81
+ def tokenize(batch, tokenizer) -> dict:
82
+ """Tokenizes a batch of examples.
83
+
84
+ Args:
85
+ batch: The examples to tokenize
86
+ tokenizer: The tokenizer to use
87
+
88
+ Returns:
89
+ dict: The tokenized batch
90
+ """
91
+ tokenized_inputs = tokenizer(batch["tokens"], truncation=True, is_split_into_words=True)
92
+ labels = []
93
+ wids = []
94
+
95
+ for idx, label in enumerate(batch["ner_tags"]):
96
+ try:
97
+ word_ids = tokenized_inputs.word_ids(batch_index=idx)
98
+ except ValueError:
99
+ word_ids = create_word_ids_from_input_ids(
100
+ tokenizer, tokenized_inputs["input_ids"][idx]
101
+ )
102
+ previous_word_idx = None
103
+ label_ids = []
104
+ for word_idx in word_ids:
105
+ if word_idx == -1 or word_idx is None or word_idx == previous_word_idx:
106
+ label_ids.append(-100)
107
+ else:
108
+ label_ids.append(label[word_idx])
109
+ previous_word_idx = word_idx
110
+ wids.append(word_ids)
111
+ labels.append(label_ids)
112
+ tokenized_inputs["word_ids"] = wids
113
+ tokenized_inputs["labels"] = labels
114
+ return tokenized_inputs
115
+
116
+
117
+ def stringify_ner_tags(batch: dict, tags) -> dict:
118
+ """Stringifies a dataset batch's NER tags."""
119
+ return {"ner_tags_str": [tags.int2str(idx) for idx in batch["ner_tags"]]}
120
+
121
+
122
+ def encode_dataset(split: Dataset, tokenizer):
123
+ """Encodes a dataset split.
124
+
125
+ Args:
126
+ split (Dataset): A Dataset object.
127
+ tokenizer: A PreTrainedTokenizer object.
128
+
129
+ Returns:
130
+ Dataset: A Dataset object with the encoded inputs.
131
+ """
132
+
133
+ tags = split.features["ner_tags"].feature
134
+ split = split.map(partial(stringify_ner_tags, tags=tags), batched=True)
135
+ remove_columns = split.column_names
136
+ ids = split["id"]
137
+ split = split.map(
138
+ partial(tokenize, tokenizer=tokenizer),
139
+ batched=True,
140
+ remove_columns=remove_columns,
141
+ )
142
+ word_ids = [[id if id is not None else -1 for id in wids] for wids in split["word_ids"]]
143
+ return split.remove_columns(["word_ids"]), word_ids, ids
144
+
145
+
146
+ def forward_pass_with_label(batch, model, collator, num_classes: int) -> dict:
147
+ """Runs the forward pass for a batch of examples.
148
+
149
+ Args:
150
+ batch: The batch to process
151
+ model: The model to process the batch with
152
+ collator: A data collator
153
+ num_classes (int): Number of classes
154
+
155
+ Returns:
156
+ dict: a dictionary containing `losses`, `preds` and `hidden_states`
157
+ """
158
+
159
+ # Convert dict of lists to list of dicts suitable for data collator
160
+ features = [dict(zip(batch, t)) for t in zip(*batch.values())]
161
+
162
+ # Pad inputs and labels and put all tensors on device
163
+ batch = collator(features)
164
+ input_ids = batch["input_ids"].to(device)
165
+ attention_mask = batch["attention_mask"].to(device)
166
+ labels = batch["labels"].to(device)
167
+
168
+ with torch.no_grad():
169
+ # Pass data through model
170
+ output = model(input_ids, attention_mask, output_hidden_states=True)
171
+ # logit.size: [batch_size, sequence_length, classes]
172
+
173
+ # Predict class with largest logit value on classes axis
174
+ preds = torch.argmax(output.logits, axis=-1).cpu().numpy() # type: ignore
175
+
176
+ # Calculate loss per token after flattening batch dimension with view
177
+ loss = cross_entropy(
178
+ output.logits.view(-1, num_classes), labels.view(-1), reduction="none"
179
+ )
180
+
181
+ # Unflatten batch dimension and convert to numpy array
182
+ loss = loss.view(len(input_ids), -1).cpu().numpy()
183
+ hidden_states = output.hidden_states[-1].cpu().numpy()
184
+
185
+ # logits = output.logits.view(len(input_ids), -1).cpu().numpy()
186
+
187
+ return {"losses": loss, "preds": preds, "hidden_states": hidden_states}
188
+
189
+
190
+ def predict(split_encoded: Dataset, model, tokenizer, collator, tags) -> pd.DataFrame:
191
+ """Generates predictions for a given dataset split and returns the results as a dataframe.
192
+
193
+ Args:
194
+ split_encoded (Dataset): The dataset to process
195
+ model: The model to process the dataset with
196
+ tokenizer: The tokenizer to process the dataset with
197
+ collator: The data collator to use
198
+ tags: The tags used in the dataset
199
+
200
+ Returns:
201
+ pd.DataFrame: A dataframe containing token-level predictions.
202
+ """
203
+
204
+ split_encoded = split_encoded.map(
205
+ partial(
206
+ forward_pass_with_label,
207
+ model=model,
208
+ collator=collator,
209
+ num_classes=tags.num_classes,
210
+ ),
211
+ batched=True,
212
+ batch_size=8,
213
+ )
214
+ df: pd.DataFrame = split_encoded.to_pandas() # type: ignore
215
+
216
+ df["tokens"] = df["input_ids"].apply(
217
+ lambda x: tokenizer.convert_ids_to_tokens(x) # type: ignore
218
+ )
219
+ df["labels"] = df["labels"].apply(
220
+ lambda x: ["IGN" if i == -100 else tags.int2str(int(i)) for i in x]
221
+ )
222
+ df["preds"] = df["preds"].apply(lambda x: [model.config.id2label[i] for i in x])
223
+ df["preds"] = df.apply(lambda x: x["preds"][: len(x["input_ids"])], axis=1)
224
+ df["losses"] = df.apply(lambda x: x["losses"][: len(x["input_ids"])], axis=1)
225
+ df["hidden_states"] = df.apply(lambda x: x["hidden_states"][: len(x["input_ids"])], axis=1)
226
+ df["total_loss"] = df["losses"].apply(sum)
227
+
228
+ return df
src/load.py ADDED
@@ -0,0 +1,101 @@
1
+ from typing import Optional
2
+
3
+ import pandas as pd
4
+ import streamlit as st
5
+ from datasets import Dataset # type: ignore
6
+
7
+ from src.data import encode_dataset, get_collator, get_data, predict
8
+ from src.model import get_encoder, get_model, get_tokenizer
9
+ from src.subpages import Context
10
+ from src.utils import align_sample, device, explode_df
11
+
12
+ _TOKENIZER_NAME = (
13
+ "xlm-roberta-base",
14
+ "gagan3012/bert-tiny-finetuned-ner",
15
+ "distilbert-base-german-cased",
16
+ )[0]
17
+
18
+
19
+ def _load_models_and_tokenizer(
20
+ encoder_model_name: str,
21
+ model_name: str,
22
+ tokenizer_name: Optional[str],
23
+ device: str = "cpu",
24
+ ):
25
+ sentence_encoder = get_encoder(encoder_model_name, device=device)
26
+ tokenizer = get_tokenizer(tokenizer_name if tokenizer_name else model_name)
27
+ labels = "O B-COMMA".split() if "comma" in model_name else None
28
+ model = get_model(model_name, labels=labels)
29
+ return sentence_encoder, model, tokenizer
30
+
31
+
32
+ @st.cache(allow_output_mutation=True)
33
+ def load_context(
34
+ encoder_model_name: str,
35
+ model_name: str,
36
+ ds_name: str,
37
+ ds_config_name: str,
38
+ ds_split_name: str,
39
+ split_sample_size: int,
40
+ randomize_sample: bool,
41
+ **kw_args,
42
+ ) -> Context:
43
+ """Utility method loading (almost) everything we need for the application.
44
+ This exists just because we want to cache the results of this function.
45
+
46
+ Args:
47
+ encoder_model_name (str): Name of the sentence encoder to load.
48
+ model_name (str): Name of the NER model to load.
49
+ ds_name (str): Dataset name or path.
50
+ ds_config_name (str): Dataset config name.
51
+ ds_split_name (str): Dataset split name.
52
+ split_sample_size (int): Number of examples to load from the split.
+ randomize_sample (bool): Whether to shuffle the split before sampling.
53
+
54
+ Returns:
55
+ Context: An object containing everything we need for the application.
56
+ """
57
+
58
+ sentence_encoder, model, tokenizer = _load_models_and_tokenizer(
59
+ encoder_model_name=encoder_model_name,
60
+ model_name=model_name,
61
+ tokenizer_name=_TOKENIZER_NAME if "comma" in model_name else None,
62
+ device=str(device),
63
+ )
64
+ collator = get_collator(tokenizer)
65
+
66
+ # load data related stuff
67
+ split: Dataset = get_data(
68
+ ds_name, ds_config_name, ds_split_name, split_sample_size, randomize_sample
69
+ )
70
+ tags = split.features["ner_tags"].feature
71
+ split_encoded, word_ids, ids = encode_dataset(split, tokenizer)
72
+
73
+ # transform into dataframe
74
+ df = predict(split_encoded, model, tokenizer, collator, tags)
75
+ df["word_ids"] = word_ids
76
+ df["ids"] = ids
77
+
78
+ # explode, clean, merge
79
+ df_tokens = explode_df(df)
80
+ df_tokens_cleaned = df_tokens.query("labels != 'IGN'")
81
+ df_merged = pd.DataFrame(df.apply(align_sample, axis=1).tolist())
82
+ df_tokens_merged = explode_df(df_merged)
83
+
84
+ return Context(
85
+ **{
86
+ "model": model,
87
+ "tokenizer": tokenizer,
88
+ "sentence_encoder": sentence_encoder,
89
+ "df": df,
90
+ "df_tokens": df_tokens,
91
+ "df_tokens_cleaned": df_tokens_cleaned,
92
+ "df_tokens_merged": df_tokens_merged,
93
+ "tags": tags,
94
+ "labels": tags.names,
95
+ "split_sample_size": split_sample_size,
96
+ "ds_name": ds_name,
97
+ "ds_config_name": ds_config_name,
98
+ "ds_split_name": ds_split_name,
99
+ "split": split,
100
+ }
101
+ )
src/model.py ADDED
@@ -0,0 +1,33 @@
1
+ import streamlit as st
2
+ from sentence_transformers import SentenceTransformer
3
+ from transformers import AutoModelForTokenClassification # type: ignore
4
+ from transformers import AutoTokenizer # type: ignore
5
+
6
+
7
+ @st.experimental_singleton()
8
+ def get_model(model_name: str, labels=None):
9
+ if labels is None:
10
+ return AutoModelForTokenClassification.from_pretrained(
11
+ model_name,
12
+ output_attentions=True,
13
+ ) # type: ignore
14
+ else:
15
+ id2label = {idx: tag for idx, tag in enumerate(labels)}
16
+ label2id = {tag: idx for idx, tag in enumerate(labels)}
17
+ return AutoModelForTokenClassification.from_pretrained(
18
+ model_name,
19
+ output_attentions=True,
20
+ num_labels=len(labels),
21
+ id2label=id2label,
22
+ label2id=label2id,
23
+ ) # type: ignore
24
+
25
+
26
+ @st.experimental_singleton()
27
+ def get_encoder(model_name: str, device: str = "cpu"):
28
+ return SentenceTransformer(model_name, device=device)
29
+
30
+
31
+ @st.experimental_singleton()
32
+ def get_tokenizer(tokenizer_name: str):
33
+ return AutoTokenizer.from_pretrained(tokenizer_name)
src/subpages/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from src.subpages.attention import AttentionPage
2
+ from src.subpages.debug import DebugPage
3
+ from src.subpages.find_duplicates import FindDuplicatesPage
4
+ from src.subpages.hidden_states import HiddenStatesPage
5
+ from src.subpages.home import HomePage
6
+ from src.subpages.inspect import InspectPage
7
+ from src.subpages.losses import LossesPage
8
+ from src.subpages.lossy_samples import LossySamplesPage
9
+ from src.subpages.metrics import MetricsPage
10
+ from src.subpages.misclassified import MisclassifiedPage
11
+ from src.subpages.page import Context, Page
12
+ from src.subpages.probing import ProbingPage
13
+ from src.subpages.random_samples import RandomSamplesPage
14
+ from src.subpages.raw_data import RawDataPage
src/subpages/attention.py ADDED
@@ -0,0 +1,160 @@
1
+ """
2
+ A group of neurons tends to fire in response to commas and other punctuation. Other groups of neurons tend to fire in response to pronouns. Use this visualization to factorize neuron activity in individual FFNN layers or in the entire model.
3
+ """
4
+ import ecco
5
+ import streamlit as st
6
+ from streamlit.components.v1 import html
7
+
8
+ from src.subpages.page import Context, Page # type: ignore
9
+
10
+ _SETUP_HTML = """
11
+ <script src="https://requirejs.org/docs/release/2.3.6/minified/require.js"></script>
12
+ <script>
13
+ var ecco_url = 'https://storage.googleapis.com/ml-intro/ecco/'
14
+ //var ecco_url = 'http://localhost:8000/'
15
+
16
+ if (window.ecco === undefined) window.ecco = {}
17
+
18
+ // Setup the paths of the script we'll be using
19
+ requirejs.config({
20
+ urlArgs: "bust=" + (new Date()).getTime(),
21
+ nodeRequire: require,
22
+ paths: {
23
+ d3: "https://d3js.org/d3.v6.min", // This is only for use in setup.html and basic.html
24
+ "d3-array": "https://d3js.org/d3-array.v2.min",
25
+ jquery: "https://code.jquery.com/jquery-3.5.1.min",
26
+ ecco: ecco_url + 'js/0.0.6/ecco-bundle.min',
27
+ xregexp: 'https://cdnjs.cloudflare.com/ajax/libs/xregexp/3.2.0/xregexp-all.min'
28
+ }
29
+ });
30
+
31
+ // Add the css file
32
+ //requirejs(['d3'],
33
+ // function (d3) {
34
+ // d3.select('#css').attr('href', ecco_url + 'html/styles.css')
35
+ // })
36
+
37
+ console.log('Ecco initialize!!')
38
+
39
+ // returns a 'basic' object. basic.init() selects the html div we'll be
40
+ // rendering the html into, adds styles.css to the document.
41
+ define('basic', ['d3'],
42
+ function (d3) {
43
+ return {
44
+ init: function (viz_id = null) {
45
+ if (viz_id == null) {
46
+ viz_id = "viz_" + Math.round(Math.random() * 10000000)
47
+ }
48
+ // Select the div rendered below, change its id
49
+ const div = d3.select('#basic').attr('id', viz_id),
50
+ div_parent = d3.select('#' + viz_id).node().parentNode
51
+
52
+ // Link to CSS file
53
+ d3.select(div_parent).insert('link')
54
+ .attr('rel', 'stylesheet')
55
+ .attr('type', 'text/css')
56
+ .attr('href', ecco_url + 'html/0.0.2/styles.css')
57
+
58
+ return viz_id
59
+ }
60
+ }
61
+ }, function (err) {
62
+ console.log(err);
63
+ }
64
+ )
65
+ </script>
66
+
67
+ <head>
68
+ <link id='css' rel="stylesheet" type="text/css">
69
+ </head>
70
+ <div id="basic"></div>
71
+ """
72
+
73
+
74
+ @st.cache(allow_output_mutation=True)
75
+ def _load_ecco_model():
76
+ model_config = {
77
+ "embedding": "embeddings.word_embeddings",
78
+ "type": "mlm",
79
+ "activations": [r"ffn\.lin1"],
80
+ "token_prefix": "",
81
+ "partial_token_prefix": "##",
82
+ }
83
+ return ecco.from_pretrained(
84
+ "elastic/distilbert-base-uncased-finetuned-conll03-english",
85
+ model_config=model_config,
86
+ activations=True,
87
+ )
88
+
89
+
90
+ class AttentionPage(Page):
91
+ name = "Activations"
92
+ icon = "activity"
93
+
94
+ def _get_widget_defaults(self):
95
+ return {
96
+ "act_n_components": 8,
97
+ "act_default_text": """Now I ask you: what can be expected of man since he is a being endowed with strange qualities? Shower upon him every earthly blessing, drown him in a sea of happiness, so that nothing but bubbles of bliss can be seen on the surface; give him economic prosperity, such that he should have nothing else to do but sleep, eat cakes and busy himself with the continuation of his species, and even then out of sheer ingratitude, sheer spite, man would play you some nasty trick. He would even risk his cakes and would deliberately desire the most fatal rubbish, the most uneconomical absurdity, simply to introduce into all this positive good sense his fatal fantastic element. It is just his fantastic dreams, his vulgar folly that he will desire to retain, simply in order to prove to himself--as though that were so necessary-- that men still are men and not the keys of a piano, which the laws of nature threaten to control so completely that soon one will be able to desire nothing but by the calendar. And that is not all: even if man really were nothing but a piano-key, even if this were proved to him by natural science and mathematics, even then he would not become reasonable, but would purposely do something perverse out of simple ingratitude, simply to gain his point. And if he does not find means he will contrive destruction and chaos, will contrive sufferings of all sorts, only to gain his point! He will launch a curse upon the world, and as only man can curse (it is his privilege, the primary distinction between him and other animals), may be by his curse alone he will attain his object--that is, convince himself that he is a man and not a piano-key!""",
98
+ "act_from_layer": 0,
99
+ "act_to_layer": 5,
100
+ }
101
+
102
+ def render(self, context: Context):
103
+ st.title(self.name)
104
+
105
+ with st.expander("ℹ️", expanded=True):
106
+ st.write(
107
+ "A group of neurons tend to fire in response to commas and other punctuation. Other groups of neurons tend to fire in response to pronouns. Use this visualization to factorize neuron activity in individual FFNN layers or in the entire model."
108
+ )
109
+
110
+ lm = _load_ecco_model()
111
+
112
+ col1, _, col2 = st.columns([1.5, 0.5, 4])
113
+ with col1:
114
+ st.subheader("Settings")
115
+ n_components = st.slider(
116
+ "#components",
117
+ key="act_n_components",
118
+ min_value=2,
119
+ max_value=10,
120
+ step=1,
121
+ )
122
+ from_layer = st.slider(
123
+ "from layer",
124
+ key="act_from_layer",
125
+ value=0,
126
+ min_value=0,
127
+ max_value=len(lm.model.transformer.layer) - 1,
128
+ step=1,
129
+ )
130
+ to_layer = (
131
+ st.slider(
132
+ "to layer",
133
+ key="act_to_layer",
134
+ value=0,
135
+ min_value=0,
136
+ max_value=len(lm.model.transformer.layer) - 1,
137
+ step=1,
138
+ )
139
+ + 1
140
+ )
141
+
142
+ if to_layer <= from_layer:
143
+ st.error("to_layer must be >= from_layer")
144
+ st.stop()
145
+
146
+ with col2:
147
+ st.subheader("–")
148
+ text = st.text_area("Text", key="act_default_text", height=240)
149
+
150
+ inputs = lm.tokenizer([text], return_tensors="pt")
151
+ output = lm(inputs)
152
+ nmf = output.run_nmf(n_components=n_components, from_layer=from_layer, to_layer=to_layer)
153
+ data = nmf.explore(returnData=True)
154
+ _JS_TEMPLATE = f"""<script>requirejs(['basic', 'ecco'], function(basic, ecco){{
155
+ const viz_id = basic.init()
156
+ ecco.interactiveTokensAndFactorSparklines(viz_id, {data}, {{ 'hltrCFG': {{'tokenization_config': {{'token_prefix': '', 'partial_token_prefix': '##'}} }} }})
157
+ }}, function (err) {{
158
+ console.log(err);
159
+ }})</script>"""
160
+ html(_SETUP_HTML + _JS_TEMPLATE, height=800, scrolling=True)
src/subpages/debug.py ADDED
@@ -0,0 +1,27 @@
1
+ import streamlit as st
2
+ from pip._internal.operations import freeze
3
+
4
+ from src.subpages.page import Context, Page
5
+
6
+
7
+ class DebugPage(Page):
8
+ name = "Debug"
9
+ icon = "bug"
10
+
11
+ def render(self, context: Context):
12
+ st.title(self.name)
13
+ # with st.expander("💡", expanded=True):
14
+ # st.write("Some debug info.")
15
+
16
+ st.subheader("Installed Packages")
17
+ # get output of pip freeze from system
18
+ with st.expander("pip freeze"):
19
+ st.code("\n".join(freeze.freeze()))
20
+
21
+ st.subheader("Streamlit Session State")
22
+ st.json(st.session_state)
23
+ st.subheader("Tokenizer")
24
+ st.code(context.tokenizer)
25
+ st.subheader("Model")
26
+ st.code(context.model.config)
27
+ st.code(context.model)
src/subpages/emoji-en-US.json ADDED
The diff for this file is too large to render. See raw diff
 
src/subpages/faiss.py ADDED
@@ -0,0 +1,58 @@
1
+ import streamlit as st
2
+ from datasets import Dataset
3
+
4
+ from src.subpages.page import Context, Page # type: ignore
5
+ from src.utils import device, explode_df, htmlify_labeled_example, tag_text
6
+
7
+
8
+ class FaissPage(Page):
9
+ name = "Bla"
10
+ icon = "x-octagon"
11
+
12
+ def render(self, context: Context):
13
+ dd = Dataset.from_pandas(context.df_tokens_merged, preserve_index=False) # type: ignore
14
+
15
+ dd.add_faiss_index(column="hidden_states", index_name="token_index")
16
+ token_id, text = (
17
+ 6,
18
+ "Die Wissenschaft ist eine wichtige Grundlage für die Entwicklung von neuen Technologien.",
19
+ )
20
+ token_id, text = (
21
+ 15,
22
+ "Außer der unbewussten Beeinflussung eines Resultats gibt es auch noch andere Motive die das reine strahlende Licht der Wissenschaft etwas zu trüben vermögen.",
23
+ )
24
+ token_id, text = (
25
+ 3,
26
+ "Mit mehr Instrumenten einer besseren präziseren Datenbasis ist auch ein viel besseres smarteres Risikomanagement möglich.",
27
+ )
28
+ token_id, text = (
29
+ 7,
30
+ "Es gilt die akademische Viertelstunde das heißt Beginn ist fünfzehn Minuten später.",
31
+ )
32
+ token_id, text = (
33
+ 7,
34
+ "Damit einher geht übrigens auch dass Marcella Collocinis Tochter keine wie auch immer geartete strafrechtliche Verfolgung zu befürchten hat.",
35
+ )
36
+ token_id, text = (
37
+ 16,
38
+ "After Steve Jobs met with Bill Gates of Microsoft back in 1993, they went to Cupertino and made the deal.",
39
+ )
40
+
41
+ tagged = tag_text(text, context.tokenizer, context.model, device)
42
+ hidden_states = tagged["hidden_states"]
43
+ # tagged.drop("hidden_states", inplace=True, axis=1)
44
+ # hidden_states_vec = svd.transform([hidden_states[token_id]])[0].astype(np.float32)
45
+ hidden_states_vec = hidden_states[token_id]
46
+ tagged = tagged.astype(str)
47
+ tagged["probs"] = tagged["probs"].apply(lambda x: x[:-2])
48
+ tagged["check"] = tagged["probs"].apply(
49
+ lambda x: "✅ ✅" if int(x) < 100 else "✅" if int(x) < 1000 else ""
50
+ )
51
+ st.dataframe(tagged.drop("hidden_states", axis=1).T)
52
+ results = dd.get_nearest_examples("token_index", hidden_states_vec, k=10)
53
+ for i, (dist, idx, token) in enumerate(
54
+ zip(results.scores, results.examples["ids"], results.examples["tokens"])
55
+ ):
56
+ st.code(f"{dist:.3f} {token}")
57
+ sample = context.df_tokens_merged.query(f"ids == '{idx}'")
58
+ st.write(f"[{i};{idx}] " + htmlify_labeled_example(sample), unsafe_allow_html=True)
src/subpages/find_duplicates.py ADDED
@@ -0,0 +1,52 @@
1
+ """Find potential duplicates in the data using cosine similarity."""
2
+ import streamlit as st
3
+ from sentence_transformers.util import cos_sim
4
+
5
+ from src.subpages.page import Context, Page
6
+
7
+
8
+ @st.cache()
9
+ def get_sims(texts: list[str], sentence_encoder):
10
+ embeddings = sentence_encoder.encode(texts, batch_size=8, convert_to_numpy=True)
11
+ return cos_sim(embeddings, embeddings)
12
+
13
+
14
+ class FindDuplicatesPage(Page):
15
+ name = "Find Duplicates"
16
+ icon = "fingerprint"
17
+
18
+ def _get_widget_defaults(self):
19
+ return {
20
+ "cutoff": 0.95,
21
+ }
22
+
23
+ def render(self, context: Context):
24
+ st.title("Find Duplicates")
25
+ with st.expander("💡", expanded=True):
26
+ st.write("Find potential duplicates in the data using cosine similarity.")
27
+
28
+ cutoff = st.slider("Similarity threshold", min_value=0.0, max_value=1.0, key="cutoff")
29
+ # split.add_faiss_index(column="embeddings", index_name="sent_index")
30
+ # st.write("Index is ready")
31
+ # sentence_encoder.encode(["hello world"], batch_size=8)
32
+ # st.write(split["tokens"][0])
33
+ texts = [" ".join(ts) for ts in context.split["tokens"]]
34
+ sims = get_sims(texts, context.sentence_encoder)
35
+
36
+ candidates = []
37
+ for i in range(len(sims)):
38
+ for j in range(i + 1, len(sims)):
39
+ if sims[i][j] >= cutoff:
40
+ candidates.append((sims[i][j], i, j))
41
+ candidates.sort(reverse=True) # show the most similar pairs first
42
+
43
+ for (sim, i, j) in candidates[:100]:
44
+ st.markdown(f"**Possible duplicate ({i}, {j}, sim: {sim:.3f}):**")
45
+ st.markdown("* " + " ".join(context.split["tokens"][i]))
46
+ st.markdown("* " + " ".join(context.split["tokens"][j]))
47
+
48
+ # st.write("queries")
49
+ # results = split.get_nearest_examples("sent_index", np.array(split["embeddings"][0], dtype=np.float32), k=2)
50
+ # results = split.get_nearest_examples_batch("sent_index", queries, k=2)
51
+ # st.write(results.total_examples[0]["id"][1])
52
+ # st.write(results.total_examples[0])
src/subpages/hidden_states.py ADDED
@@ -0,0 +1,194 @@
1
+ """
2
+ For every token in the dataset, we take its hidden state and project it onto a two-dimensional plane. Data points are colored by label/prediction, with disagreements marked by a small black border.
3
+ """
4
+ import numpy as np
5
+ import plotly.express as px
6
+ import plotly.graph_objects as go
7
+ import streamlit as st
8
+
9
+ from src.subpages.page import Context, Page
10
+
11
+
12
+ @st.cache
13
+ def reduce_dim_svd(X, n_iter: int, random_state=42):
14
+ """Dimensionality reduction using truncated SVD (aka LSA).
15
+
16
+ This transformer performs linear dimensionality reduction by means of truncated singular value decomposition (SVD). Contrary to PCA, this estimator does not center the data before computing the singular value decomposition. This means it can work with sparse matrices efficiently.
17
+
18
+ Args:
19
+ X: Training data
20
+ n_iter (int): Number of iterations for the randomized SVD solver.
21
+ random_state (int, optional): Used during randomized svd. Pass an int for reproducible results across multiple function calls. Defaults to 42.
22
+
23
+ Returns:
24
+ ndarray: Reduced version of X, ndarray of shape (n_samples, 2).
25
+ """
26
+ from sklearn.decomposition import TruncatedSVD
27
+
28
+ svd = TruncatedSVD(n_components=2, n_iter=n_iter, random_state=random_state)
29
+ return svd.fit_transform(X)
30
+
31
+
32
+ @st.cache
33
+ def reduce_dim_pca(X, random_state=42):
34
+ """Principal component analysis (PCA).
35
+
36
+ Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD.
37
+
38
+ Args:
39
+ X: Training data
40
+ random_state (int, optional): Used when the 'arpack' or 'randomized' solvers are used. Pass an int for reproducible results across multiple function calls.
41
+
42
+ Returns:
43
+ ndarray: Reduced version of X, ndarray of shape (n_samples, 2).
44
+ """
45
+ from sklearn.decomposition import PCA
46
+
47
+ return PCA(n_components=2, random_state=random_state).fit_transform(X)
48
+
49
+
50
+ @st.cache
51
+ def reduce_dim_umap(X, n_neighbors=5, min_dist=0.1, metric="euclidean"):
52
+ """Uniform Manifold Approximation and Projection
53
+
54
+ Finds a low dimensional embedding of the data that approximates an underlying manifold.
55
+
56
+ Args:
57
+ X: Training data
58
+ n_neighbors (int, optional): The size of local neighborhood (in terms of number of neighboring sample points) used for manifold approximation. Larger values result in more global views of the manifold, while smaller values result in more local data being preserved. In general values should be in the range 2 to 100. Defaults to 5.
59
+ min_dist (float, optional): The effective minimum distance between embedded points. Smaller values will result in a more clustered/clumped embedding where nearby points on the manifold are drawn closer together, while larger values will result on a more even dispersal of points. The value should be set relative to the `spread` value, which determines the scale at which embedded points will be spread out. Defaults to 0.1.
60
+ metric (str, optional): The metric to use to compute distances in high dimensional space (see UMAP docs for options). Defaults to "euclidean".
61
+
62
+ Returns:
63
+ ndarray: Reduced version of X, ndarray of shape (n_samples, 2).
64
+ """
65
+ from umap import UMAP
66
+
67
+ return UMAP(n_neighbors=n_neighbors, min_dist=min_dist, metric=metric).fit_transform(X)
68
+
69
+
70
+ class HiddenStatesPage(Page):
71
+ name = "Hidden States"
72
+ icon = "grid-3x3"
73
+
74
+ def _get_widget_defaults(self):
75
+ return {
76
+ "n_tokens": 1_000,
77
+ "svd_n_iter": 5,
78
+ "svd_random_state": 42,
79
+ "umap_n_neighbors": 15,
80
+ "umap_metric": "euclidean",
81
+ "umap_min_dist": 0.1,
82
+ }
83
+
84
+ def render(self, context: Context):
85
+ st.title("Embeddings")
86
+
87
+ with st.expander("💡", expanded=True):
88
+ st.write(
89
+ "For every token in the dataset, we take its hidden state and project it onto a two-dimensional plane. Data points are colored by label/prediction, with disagreements signified by a small black border."
90
+ )
91
+
92
+ col1, _, col2 = st.columns([9 / 32, 1 / 32, 22 / 32])
93
+ df = context.df_tokens_merged.copy()
94
+ dim_algo = "SVD"
95
+ n_tokens = 100
96
+
97
+ with col1:
98
+ st.subheader("Settings")
99
+ n_tokens = st.slider(
100
+ "#tokens",
101
+ key="n_tokens",
102
+ min_value=100,
103
+ max_value=len(df["tokens"].unique()),
104
+ step=100,
105
+ )
106
+
107
+ dim_algo = st.selectbox("Dimensionality reduction algorithm", ["SVD", "PCA", "UMAP"])
108
+ if dim_algo == "SVD":
109
+ svd_n_iter = st.slider(
110
+ "#iterations",
111
+ key="svd_n_iter",
112
+ min_value=1,
113
+ max_value=10,
114
+ step=1,
115
+ )
116
+ elif dim_algo == "UMAP":
117
+ umap_n_neighbors = st.slider(
118
+ "#neighbors",
119
+ key="umap_n_neighbors",
120
+ min_value=2,
121
+ max_value=100,
122
+ step=1,
123
+ )
124
+ umap_min_dist = st.number_input(
125
+ "Min distance", key="umap_min_dist", value=0.1, min_value=0.0, max_value=1.0
126
+ )
127
+ umap_metric = st.selectbox(
128
+ "Metric", ["euclidean", "manhattan", "chebyshev", "minkowski"]
129
+ )
130
+ else:
131
+ pass
132
+
133
+ with col2:
134
+ sents = df.groupby("ids").apply(lambda x: " ".join(x["tokens"].tolist()))
135
+
136
+ X = np.array(df["hidden_states"].tolist())
137
+ transformed_hidden_states = None
138
+ if dim_algo == "SVD":
139
+ transformed_hidden_states = reduce_dim_svd(X, n_iter=svd_n_iter) # type: ignore
140
+ elif dim_algo == "PCA":
141
+ transformed_hidden_states = reduce_dim_pca(X)
142
+ elif dim_algo == "UMAP":
143
+ transformed_hidden_states = reduce_dim_umap(
144
+ X, n_neighbors=umap_n_neighbors, min_dist=umap_min_dist, metric=umap_metric # type: ignore
145
+ )
146
+
147
+ assert isinstance(transformed_hidden_states, np.ndarray)
148
+ df["x"] = transformed_hidden_states[:, 0]
149
+ df["y"] = transformed_hidden_states[:, 1]
150
+ df["sent0"] = df["ids"].map(lambda x: " ".join(sents[x][0:50].split()))
151
+ df["sent1"] = df["ids"].map(lambda x: " ".join(sents[x][50:100].split()))
152
+ df["sent2"] = df["ids"].map(lambda x: " ".join(sents[x][100:150].split()))
153
+ df["sent3"] = df["ids"].map(lambda x: " ".join(sents[x][150:200].split()))
154
+ df["sent4"] = df["ids"].map(lambda x: " ".join(sents[x][200:250].split()))
155
+ df["disagreements"] = df["labels"] != df["preds"]
156
+
157
+ subset = df[:n_tokens]
158
+ disagreements_trace = go.Scatter(
159
+ x=subset[subset["disagreements"]]["x"],
160
+ y=subset[subset["disagreements"]]["y"],
161
+ mode="markers",
162
+ marker=dict(
163
+ size=6,
164
+ color="rgba(0,0,0,0)",
165
+ line=dict(width=1),
166
+ ),
167
+ hoverinfo="skip",
168
+ )
169
+
170
+ st.subheader("Projection Results")
171
+
172
+ fig = px.scatter(
173
+ subset,
174
+ x="x",
175
+ y="y",
176
+ color="labels",
177
+ hover_data=["ids", "preds", "sent0", "sent1", "sent2", "sent3", "sent4"],
178
+ hover_name="tokens",
179
+ title="Colored by label",
180
+ )
181
+ fig.add_trace(disagreements_trace)
182
+ st.plotly_chart(fig)
183
+
184
+ fig = px.scatter(
185
+ subset,
186
+ x="x",
187
+ y="y",
188
+ color="preds",
189
+ hover_data=["ids", "labels", "sent0", "sent1", "sent2", "sent3", "sent4"],
190
+ hover_name="tokens",
191
+ title="Colored by prediction",
192
+ )
193
+ fig.add_trace(disagreements_trace)
194
+ st.plotly_chart(fig)
src/subpages/home.py ADDED
@@ -0,0 +1,163 @@
1
+ import json
2
+ import random
3
+ from typing import Optional
4
+
5
+ import streamlit as st
6
+
7
+ from src.data import get_data
8
+ from src.subpages.page import Context, Page
9
+ from src.utils import PROJ, classmap, color_map_color
10
+
11
+ _SENTENCE_ENCODER_MODEL = (
12
+ "sentence-transformers/all-MiniLM-L6-v2",
13
+ "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
14
+ )[0]
15
+ _MODEL_NAME = (
16
+ "elastic/distilbert-base-uncased-finetuned-conll03-english",
17
+ "gagan3012/bert-tiny-finetuned-ner",
18
+ "socialmediaie/bertweet-base_wnut17_ner",
19
+ "sberbank-ai/bert-base-NER-reptile-5-datasets",
20
+ "aseifert/comma-xlm-roberta-base",
21
+ "dslim/bert-base-NER",
22
+ "aseifert/distilbert-base-german-cased-comma-derstandard",
23
+ )[0]
24
+ _DATASET_NAME = (
25
+ "conll2003",
26
+ "wnut_17",
27
+ "aseifert/comma",
28
+ )[0]
29
+ _CONFIG_NAME = (
30
+ "conll2003",
31
+ "wnut_17",
32
+ "seifertverlag",
33
+ )[0]
34
+
35
+
36
+ class HomePage(Page):
37
+ name = "Home / Setup"
38
+ icon = "house"
39
+
40
+ def _get_widget_defaults(self):
41
+ return {
42
+ "encoder_model_name": _SENTENCE_ENCODER_MODEL,
43
+ "model_name": _MODEL_NAME,
44
+ "ds_name": _DATASET_NAME,
45
+ "ds_split_name": "validation",
46
+ "ds_config_name": _CONFIG_NAME,
47
+ "split_sample_size": 512,
48
+ "randomize_sample": True,
49
+ }
50
+
51
+ def render(self, context: Optional[Context] = None):
52
+ st.title("ExplaiNER")
53
+
54
+ with st.expander("💡", expanded=True):
55
+ st.write(
56
+ "**Error Analysis is an important but often overlooked part of the data science project lifecycle**, for which there is still very little tooling available. Practitioners tend to write throwaway code or, worse, skip this crucial step of understanding their models' errors altogether. This project tries to provide an **extensive toolkit to probe any NER model/dataset combination**, find labeling errors and understand the models' and datasets' limitations, leading the user on her way to further **improving both model AND dataset**."
57
+ )
58
+ st.write(
59
+ "**Note:** This Space requires a fair amount of computation, so please be patient with the loading animations. 🙏 I am caching as much as possible, so after the first wait most things should be precomputed."
60
+ )
61
+ st.write(
62
+ "_Caveat: Even though everything is customizable here, I haven't tested this app much with different models/datasets._"
63
+ )
64
+
65
+ col1, _, col2a, col2b = st.columns([0.8, 0.05, 0.15, 0.15])
66
+
67
+ with col1:
68
+ random_form_key = f"settings-{random.randint(0, 100000)}"
69
+ # FIXME: for some reason I'm getting the following error if I don't randomize the key:
70
+ """
71
+ 2022-05-05 20:37:16.507 Traceback (most recent call last):
72
+ File "/Users/zoro/mambaforge/lib/python3.9/site-packages/streamlit/scriptrunner/script_runner.py", line 443, in _run_script
73
+ exec(code, module.__dict__)
74
+ File "/Users/zoro/code/error-analysis/main.py", line 162, in <module>
75
+ main()
76
+ File "/Users/zoro/code/error-analysis/main.py", line 102, in main
77
+ show_setup()
78
+ File "/Users/zoro/code/error-analysis/section/setup.py", line 68, in show_setup
79
+ st.form_submit_button("Load Model & Data")
80
+ File "/Users/zoro/mambaforge/lib/python3.9/site-packages/streamlit/elements/form.py", line 240, in form_submit_button
81
+ return self._form_submit_button(
82
+ File "/Users/zoro/mambaforge/lib/python3.9/site-packages/streamlit/elements/form.py", line 260, in _form_submit_button
83
+ return self.dg._button(
84
+ File "/Users/zoro/mambaforge/lib/python3.9/site-packages/streamlit/elements/button.py", line 304, in _button
85
+ check_session_state_rules(default_value=None, key=key, writes_allowed=False)
86
+ File "/Users/zoro/mambaforge/lib/python3.9/site-packages/streamlit/elements/utils.py", line 74, in check_session_state_rules
87
+ raise StreamlitAPIException(
88
+ streamlit.errors.StreamlitAPIException: Values for st.button, st.download_button, st.file_uploader, and st.form cannot be set using st.session_state.
89
+ """
90
+ with st.form(key=random_form_key):
91
+ st.subheader("Model & Data Selection")
92
+ st.text_input(
93
+ label="NER Model:",
94
+ key="model_name",
95
+ help="Path or name of the model to use",
96
+ )
97
+ st.text_input(
98
+ label="Encoder Model:",
99
+ key="encoder_model_name",
100
+ help="Path or name of the encoder to use for duplicate detection",
101
+ )
102
+ ds_name = st.text_input(
103
+ label="Dataset:",
104
+ key="ds_name",
105
+ help="Path or name of the dataset to use",
106
+ )
107
+ ds_config_name = st.text_input(
108
+ label="Config (optional):",
109
+ key="ds_config_name",
110
+ )
111
+ ds_split_name = st.selectbox(
112
+ label="Split:",
113
+ options=["train", "validation", "test"],
114
+ key="ds_split_name",
115
+ )
116
+ split_sample_size = st.number_input(
117
+ "Sample size:",
118
+ step=16,
119
+ key="split_sample_size",
120
+ help="Sample size for the split, speeds up processing inside streamlit",
121
+ )
122
+ randomize_sample = st.checkbox(
123
+ "Randomize sample",
124
+ key="randomize_sample",
125
+ help="Whether to randomize the sample",
126
+ )
127
+ # breakpoint()
128
+ # st.form_submit_button("Submit")
129
+ st.form_submit_button("Load Model & Data")
130
+
131
+ split = get_data(
132
+ ds_name, ds_config_name, ds_split_name, split_sample_size, randomize_sample # type: ignore
133
+ )
134
+ labels = list(
135
+ set([n.split("-")[1] for n in split.features["ner_tags"].feature.names if n != "O"])
136
+ )
137
+
138
+ with col2a:
139
+ st.subheader("Classes")
140
+ st.write("**Color**")
141
+ colors = {label: color_map_color(i / len(labels)) for i, label in enumerate(labels)}
142
+ for label in labels:
143
+ if f"color_{label}" not in st.session_state:
144
+ st.session_state[f"color_{label}"] = colors[label]
145
+ st.color_picker(label, key=f"color_{label}")
146
+ with col2b:
147
+ st.subheader("—")
148
+ st.write("**Icon**")
149
+ emojis = list(json.load(open(PROJ / "subpages/emoji-en-US.json")).keys())
150
+ for label in labels:
151
+ if f"icon_{label}" not in st.session_state:
152
+ st.session_state[f"icon_{label}"] = classmap[label]
153
+ st.selectbox(label, key=f"icon_{label}", options=emojis)
154
+ classmap[label] = st.session_state[f"icon_{label}"]
155
+
156
+ # if st.button("Reset to defaults"):
157
+ # st.session_state.update(**get_home_page_defaults())
158
+ # # time.sleep 2 secs
159
+ # import time
160
+ # time.sleep(1)
161
+
162
+ # # st.legacy_caching.clear_cache()
163
+ # st.experimental_rerun()
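
As a quick reference for the setup above (src/subpages/home.py), here is a minimal standalone sketch — assuming the stock datasets library and the default conll2003 dataset — of how the class list is derived from the split's ner_tags feature names; HomePage.render does the same thing via get_data:

from datasets import load_dataset

# Standalone illustration only; inside the app the split comes from src.data.get_data().
split = load_dataset("conll2003", split="validation")
tag_names = split.features["ner_tags"].feature.names  # e.g. ["O", "B-PER", "I-PER", ...]
labels = sorted({name.split("-")[1] for name in tag_names if name != "O"})
print(labels)  # ['LOC', 'MISC', 'ORG', 'PER']
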
src/subpages/inspect.py ADDED
@@ -0,0 +1,41 @@
1
+ """Inspect your whole dataset, either unfiltered or by id."""
2
+ import streamlit as st
3
+
4
+ from src.subpages.page import Context, Page
5
+ from src.utils import aggrid_interactive_table, colorize_classes
6
+
7
+
8
+ class InspectPage(Page):
9
+ name = "Inspect"
10
+ icon = "search"
11
+
12
+ def render(self, context: Context):
13
+ st.title(self.name)
14
+ with st.expander("💡", expanded=True):
15
+ st.write("Inspect your whole dataset, either unfiltered or by id.")
16
+
17
+ df = context.df_tokens
18
+ cols = (
19
+ "ids input_ids token_type_ids word_ids losses tokens labels preds total_loss".split()
20
+ )
21
+ if "token_type_ids" not in df.columns:
22
+ cols.remove("token_type_ids")
23
+ df = df.drop("hidden_states", axis=1).drop("attention_mask", axis=1)[cols]
24
+
25
+ if st.checkbox("Filter by id", value=True):
26
+ ids = list(sorted(map(int, df.ids.unique())))
27
+ next_id = st.session_state.get("next_id", 0)
28
+
29
+ example_id = st.selectbox("Select an example", ids, index=next_id)
30
+ df = df[df.ids == str(example_id)][1:-1]
31
+ # st.dataframe(colorize_classes(df).format(precision=3).bar(subset="losses")) # type: ignore
32
+ st.dataframe(colorize_classes(df.round(3).astype(str)))
33
+
34
+ # if st.button("➡️ Next example"):
35
+ # st.session_state.next_id = (ids.index(example_id) + 1) % len(ids)
36
+ # st.experimental_rerun()
37
+ # if st.button("⬅️ Previous example"):
38
+ # st.session_state.next_id = (ids.index(example_id) - 1) % len(ids)
39
+ # st.experimental_rerun()
40
+ else:
41
+ aggrid_interactive_table(df.round(3))
src/subpages/losses.py ADDED
@@ -0,0 +1,67 @@
1
+ """Show count, mean and median loss per token and label."""
2
+ import streamlit as st
3
+
4
+ from src.subpages.page import Context, Page
5
+ from src.utils import AgGrid, aggrid_interactive_table
6
+
7
+
8
+ @st.cache
9
+ def get_loss_by_token(df_tokens):
10
+ return (
11
+ df_tokens.groupby("tokens")[["losses"]]
12
+ .agg(["count", "mean", "median", "sum"])
13
+ .droplevel(level=0, axis=1) # Get rid of multi-level columns
14
+ .sort_values(by="sum", ascending=False)
15
+ .reset_index()
16
+ )
17
+
18
+
19
+ @st.cache
20
+ def get_loss_by_label(df_tokens):
21
+ return (
22
+ df_tokens.groupby("labels")[["losses"]]
23
+ .agg(["count", "mean", "median", "sum"])
24
+ .droplevel(level=0, axis=1)
25
+ .sort_values(by="mean", ascending=False)
26
+ .reset_index()
27
+ )
28
+
29
+
30
+ class LossesPage(Page):
31
+ name = "Loss by Token/Label"
32
+ icon = "sort-alpha-down"
33
+
34
+ def render(self, context: Context):
35
+ st.title(self.name)
36
+ with st.expander("💡", expanded=True):
37
+ st.write("Show count, mean and median loss per token and label.")
38
+ st.write(
39
+ "Look out for tokens that have a big gap between mean and median, indicating systematic labeling issues."
40
+ )
41
+
42
+ col1, _, col2 = st.columns([8, 1, 6])
43
+
44
+ with col1:
45
+ st.subheader("💬 Loss by Token")
46
+
47
+ st.session_state["_merge_tokens"] = st.checkbox(
48
+ "Merge tokens", value=True, key="merge_tokens"
49
+ )
50
+ loss_by_token = (
51
+ get_loss_by_token(context.df_tokens_merged)
52
+ if st.session_state["merge_tokens"]
53
+ else get_loss_by_token(context.df_tokens_cleaned)
54
+ )
55
+ aggrid_interactive_table(loss_by_token.round(3))
56
+ # st.subheader("🏷️ Loss by Label")
57
+ # loss_by_label = get_loss_by_label(df_tokens_cleaned)
58
+ # st.dataframe(loss_by_label)
59
+
60
+ st.write(
61
+ "_Caveat: Even though tokens have contextual representations, we average them to get these summary statistics._"
62
+ )
63
+
64
+ with col2:
65
+ st.subheader("🏷️ Loss by Label")
66
+ loss_by_label = get_loss_by_label(context.df_tokens_cleaned)
67
+ AgGrid(loss_by_label.round(3), height=200)
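
To make the mean/median hint above concrete, here is a small self-contained sketch with made-up numbers (not from any real run): a token that is mislabeled only a handful of times collects a few very large losses, which pull its mean far above its median — the same aggregation get_loss_by_token performs.

import pandas as pd

# Toy token-level losses: "paris" is usually easy but occasionally mislabeled.
df_tokens = pd.DataFrame({
    "tokens": ["paris"] * 6 + ["the"] * 6,
    "losses": [0.01, 0.02, 0.01, 4.5, 5.2, 0.02] + [0.01] * 6,
})
stats = (
    df_tokens.groupby("tokens")[["losses"]]
    .agg(["count", "mean", "median", "sum"])
    .droplevel(level=0, axis=1)  # flatten the multi-level columns
    .sort_values(by="sum", ascending=False)
    .reset_index()
)
print(stats)  # "paris": mean ≈ 1.63 vs. median 0.02 -> worth inspecting for labeling issues
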
src/subpages/lossy_samples.py ADDED
@@ -0,0 +1,115 @@
1
+ """Show every example sorted by loss (descending) for close inspection."""
2
+ import pandas as pd
3
+ import streamlit as st
4
+
5
+ from src.subpages.page import Context, Page
6
+ from src.utils import (
7
+ colorize_classes,
8
+ get_bg_color,
9
+ get_fg_color,
10
+ htmlify_labeled_example,
11
+ )
12
+
13
+
14
+ class LossySamplesPage(Page):
15
+ name = "Samples by Loss"
16
+ icon = "sort-numeric-down-alt"
17
+
18
+ def _get_widget_defaults(self):
19
+ return {
20
+ "skip_correct": True,
21
+ "samples_by_loss_show_df": True,
22
+ }
23
+
24
+ def render(self, context: Context):
25
+ st.title(self.name)
26
+ with st.expander("💡", expanded=True):
27
+ st.write("Show every example sorted by loss (descending) for close inspection.")
28
+ st.write(
29
+ "The **dataframe** is mostly self-explanatory. The cells are color-coded by label; a lighter color signifies a continuation label. Cells in the loss row are filled red from left to right, relative to the top loss."
30
+ )
31
+ st.write(
32
+ "The **numbers to the left**: the top line (black background) shows the sample number (as listed here) and the sample index (from the dataset). Below it, on a yellow background, is the total loss for the given sample."
33
+ )
34
+ st.write(
35
+ "The **annotated sample**: Every predicted entity (every token, really) gets a black border. The text color signifies the predicted label, with the first token of a sequence of tokens also showing the label's icon. If (and only if) the prediction is wrong, a small box after the entity (token) contains the correct target class, with a background color corresponding to that class."
36
+ )
37
+
38
+ st.subheader("💥 Samples ⬇loss")
39
+ skip_correct = st.checkbox("Skip correct examples", value=True, key="skip_correct")
40
+ show_df = st.checkbox("Show dataframes", key="samples_by_loss_show_df")
41
+
42
+ st.write(
43
+ """<style>
44
+ thead {
45
+ display: none;
46
+ }
47
+ td {
48
+ white-space: nowrap;
49
+ padding: 0 5px !important;
50
+ }
51
+ </style>""",
52
+ unsafe_allow_html=True,
53
+ )
54
+
55
+ top_indices = (
56
+ context.df.sort_values(by="total_loss", ascending=False)
57
+ .query("total_loss > 0.5")
58
+ .index
59
+ )
60
+
61
+ cnt = 0
62
+ for idx in top_indices:
63
+ sample = context.df_tokens_merged.loc[idx]
64
+
65
+ if isinstance(sample, pd.Series):
66
+ continue
67
+
68
+ if skip_correct and sum(sample.labels != sample.preds) == 0:
69
+ continue
70
+
71
+ if show_df:
72
+
73
+ def colorize_col(col):
74
+ if col.name == "labels" or col.name == "preds":
75
+ bgs = []
76
+ fgs = []
77
+ ops = []
78
+ for v in col.values:
79
+ bgs.append(get_bg_color(v.split("-")[1]) if "-" in v else "#ffffff")
80
+ fgs.append(get_fg_color(bgs[-1]))
81
+ ops.append("1" if v.split("-")[0] == "B" or v == "O" else "0.5")
82
+ return [
83
+ f"background-color: {bg}; color: {fg}; opacity: {op};"
84
+ for bg, fg, op in zip(bgs, fgs, ops)
85
+ ]
86
+ return [""] * len(col)
87
+
88
+ df = sample.reset_index().drop(["index", "hidden_states", "ids"], axis=1).round(3)
89
+ losses_slice = pd.IndexSlice["losses", :]
90
+ # x = df.T.astype(str)
91
+ # st.dataframe(x)
92
+ # st.dataframe(x.loc[losses_slice])
93
+ styler = (
94
+ df.T.style.apply(colorize_col, axis=1)
95
+ .bar(subset=losses_slice, axis=1)
96
+ .format(precision=3)
97
+ )
98
+ # styler.data = styler.data.astype(str)
99
+ st.write(styler.to_html(), unsafe_allow_html=True)
100
+ st.write("")
101
+ # st.dataframe(colorize_classes(sample.drop("hidden_states", axis=1)))#.bar(subset='losses')) # type: ignore
102
+ # st.write(
103
+ # colorize_errors(sample.round(3).drop("hidden_states", axis=1).astype(str))
104
+ # )
105
+
106
+ col1, _, col2 = st.columns([3.5 / 32, 0.5 / 32, 28 / 32])
107
+
108
+ cnt += 1
109
+ counter = f"<span title='#sample | index' style='display: block; background-color: black; opacity: 1; color: white; padding: 0 5px'>[{cnt} | {idx}]</span>"
110
+ loss = f"<span title='total loss' style='display: block; background-color: yellow; color: gray; padding: 0 5px;'>𝐿 {sample.losses.sum():.3f}</span>"
111
+ col1.write(f"{counter}{loss}", unsafe_allow_html=True)
112
+ col1.write("")
113
+
114
+ col2.write(htmlify_labeled_example(sample), unsafe_allow_html=True)
115
+ # st.write(f"[{i};{idx}] " + htmlify_corr_sample(sample), unsafe_allow_html=True)
src/subpages/metrics.py ADDED
@@ -0,0 +1,104 @@
1
+ """
2
+ The metrics page contains precision, recall and f-score metrics as well as a confusion matrix over all the classes. By default, the confusion matrix is normalized. There's an option to zero out the diagonal, leaving only prediction errors (here it makes sense to turn off normalization, so you get raw error counts).
3
+ """
4
+ import re
5
+
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import pandas as pd
9
+ import plotly.express as px
10
+ import streamlit as st
11
+ from seqeval.metrics import classification_report
12
+ from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
13
+
14
+ from src.subpages.page import Context, Page
15
+
16
+
17
+ def _get_evaluation(df):
18
+ y_true = df.apply(lambda row: [lbl for lbl in row.labels if lbl != "IGN"], axis=1)
19
+ y_pred = df.apply(
20
+ lambda row: [pred for (pred, lbl) in zip(row.preds, row.labels) if lbl != "IGN"],
21
+ axis=1,
22
+ )
23
+ report: str = classification_report(y_true, y_pred, scheme="IOB2", digits=3) # type: ignore
24
+ return report.replace(
25
+ "precision recall f1-score support",
26
+ "=" * 12 + " precision recall f1-score support",
27
+ )
28
+
29
+
30
+ def plot_confusion_matrix(y_true, y_preds, labels, normalize=None, zero_diagonal=True):
31
+ cm = confusion_matrix(y_true, y_preds, normalize=normalize, labels=labels)
32
+ if zero_diagonal:
33
+ np.fill_diagonal(cm, 0)
34
+
35
+ # st.write(plt.rcParams["font.size"])
36
+ # plt.rcParams.update({'font.size': 10.0})
37
+ fig, ax = plt.subplots(figsize=(10, 10))
38
+ disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
39
+ fmt = "d" if normalize is None else ".3f"
40
+ disp.plot(
41
+ cmap="Blues",
42
+ include_values=True,
43
+ xticks_rotation="vertical",
44
+ values_format=fmt,
45
+ ax=ax,
46
+ colorbar=False,
47
+ )
48
+ return fig
49
+
50
+
51
+ class MetricsPage(Page):
52
+ name = "Metrics"
53
+ icon = "graph-up-arrow"
54
+
55
+ def _get_widget_defaults(self):
56
+ return {
57
+ "normalize": True,
58
+ "zero_diagonal": False,
59
+ }
60
+
61
+ def render(self, context: Context):
62
+ st.title(self.name)
63
+ with st.expander("💡", expanded=True):
64
+ st.write(
65
+ "The metrics page contains precision, recall and f-score metrics as well as a confusion matrix over all the classes. By default, the confusion matrix is normalized. There's an option to zero out the diagonal, leaving only prediction errors (here it makes sense to turn off normalization, so you get raw error counts)."
66
+ )
67
+ st.write(
68
+ "In the support/F1 scatter plot, you don't want any of the classes to end up in the bottom-right quadrant: those are frequent but error-prone."
69
+ )
70
+
71
+ eval_results = _get_evaluation(context.df)
72
+ if len(eval_results.splitlines()) < 8:
73
+ col1, _, col2 = st.columns([8, 1, 1])
74
+ else:
75
+ col1 = col2 = st
76
+
77
+ col1.subheader("🎯 Evaluation Results")
78
+ col1.code(eval_results)
79
+
80
+ results = [re.split(r" +", l.lstrip()) for l in eval_results.splitlines()[2:-4]]
81
+ data = [(r[0], int(r[-1]), float(r[-2])) for r in results]
82
+ df = pd.DataFrame(data, columns="class support f1".split())
83
+ fig = px.scatter(
84
+ df,
85
+ x="support",
86
+ y="f1",
87
+ range_y=(0, 1.05),
88
+ color="class",
89
+ )
90
+ # fig.update_layout(title_text="asdf", title_yanchor="bottom")
91
+ col1.plotly_chart(fig)
92
+
93
+ col2.subheader("🔠 Confusion Matrix")
94
+ normalize = None if not col2.checkbox("Normalize", key="normalize") else "true"
95
+ zero_diagonal = col2.checkbox("Zero Diagonal", key="zero_diagonal")
96
+ col2.pyplot(
97
+ plot_confusion_matrix(
98
+ y_true=context.df_tokens_cleaned["labels"],
99
+ y_preds=context.df_tokens_cleaned["preds"],
100
+ labels=context.labels,
101
+ normalize=normalize,
102
+ zero_diagonal=zero_diagonal,
103
+ ),
104
+ )
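
A tiny self-contained sketch of the zero-diagonal option described above, using toy labels (not taken from the app): zeroing the diagonal removes the correct predictions, so only the raw confusion counts between classes remain.

import numpy as np
from sklearn.metrics import confusion_matrix

y_true = ["B-PER", "B-PER", "B-LOC", "B-ORG", "O", "O"]
y_pred = ["B-PER", "B-ORG", "B-LOC", "B-PER", "O", "B-LOC"]
labels = ["O", "B-PER", "B-LOC", "B-ORG"]

cm = confusion_matrix(y_true, y_pred, labels=labels)  # raw counts (normalize=None)
np.fill_diagonal(cm, 0)  # drop correct predictions, keep only the errors
print(cm)  # row = true label, column = predicted label
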
src/subpages/misclassified.py ADDED
@@ -0,0 +1,82 @@
1
+ """This page contains all misclassified examples and allows filtering by specific error types."""
2
+ from collections import defaultdict
3
+
4
+ import pandas as pd
5
+ import streamlit as st
6
+ from sklearn.metrics import confusion_matrix
7
+
8
+ from src.subpages.page import Context, Page
9
+ from src.utils import htmlify_labeled_example
10
+
11
+
12
+ class MisclassifiedPage(Page):
13
+ name = "Misclassified"
14
+ icon = "x-octagon"
15
+
16
+ def render(self, context: Context):
17
+ st.title(self.name)
18
+ with st.expander("💡", expanded=True):
19
+ st.write(
20
+ "This page contains all misclassified examples and allows filtering by specific error types."
21
+ )
22
+
23
+ misclassified_indices = context.df_tokens_merged.query("labels != preds").index.unique()
24
+ misclassified_samples = context.df_tokens_merged.loc[misclassified_indices]
25
+ cm = confusion_matrix(
26
+ misclassified_samples.labels,
27
+ misclassified_samples.preds,
28
+ labels=context.labels,
29
+ )
30
+
31
+ # st.pyplot(
32
+ # plot_confusion_matrix(
33
+ # y_preds=misclassified_samples["preds"],
34
+ # y_true=misclassified_samples["labels"],
35
+ # labels=labels,
36
+ # normalize=None,
37
+ # zero_diagonal=True,
38
+ # ),
39
+ # )
40
+ df = pd.DataFrame(cm, index=context.labels, columns=context.labels).astype(str)
41
+ import numpy as np
42
+
43
+ np.fill_diagonal(df.values, "")
44
+ st.dataframe(df.applymap(lambda x: x if x != "0" else ""))
45
+ # import matplotlib.pyplot as plt
46
+ # st.pyplot(df.style.background_gradient(cmap='RdYlGn_r').to_html())
47
+ # selection = aggrid_interactive_table(df)
48
+
49
+ # st.write(df.to_html(escape=False, index=False), unsafe_allow_html=True)
50
+
51
+ confusions = defaultdict(int)
52
+ for i, row in enumerate(cm):
53
+ for j, _ in enumerate(row):
54
+ if i == j or cm[i][j] == 0:
55
+ continue
56
+ confusions[(context.labels[i], context.labels[j])] += cm[i][j]
57
+
58
+ def format_func(item):
59
+ return (
60
+ f"true: {item[0][0]} <> pred: {item[0][1]} ||| count: {item[1]}" if item else "All"
61
+ )
62
+
63
+ conf = st.radio(
64
+ "Filter by Class Confusion",
65
+ options=list(zip(confusions.keys(), confusions.values())),
66
+ format_func=format_func,
67
+ )
68
+
69
+ # st.write(
70
+ # f"**Filtering Examples:** True class: `{conf[0][0]}`, Predicted class: `{conf[0][1]}`"
71
+ # )
72
+
73
+ filtered_indices = misclassified_samples.query(
74
+ f"labels == '{conf[0][0]}' and preds == '{conf[0][1]}'"
75
+ ).index
76
+ for i, idx in enumerate(filtered_indices):
77
+ sample = context.df_tokens_merged.loc[idx]
78
+ st.write(
79
+ htmlify_labeled_example(sample),
80
+ unsafe_allow_html=True,
81
+ )
82
+ st.write("---")
src/subpages/page.py ADDED
@@ -0,0 +1,50 @@
1
+ from dataclasses import dataclass
2
+ from typing import Any
3
+
4
+ import pandas as pd
5
+ from datasets import Dataset # type: ignore
6
+ from sentence_transformers import SentenceTransformer
7
+ from transformers import AutoModelForSequenceClassification # type: ignore
8
+ from transformers import AutoTokenizer # type: ignore
9
+
10
+
11
+ @dataclass
12
+ class Context:
13
+ """This object facilitates passing around the application's state between different pages."""
14
+
15
+ model: AutoModelForSequenceClassification
16
+ tokenizer: AutoTokenizer
17
+ sentence_encoder: SentenceTransformer
18
+ tags: Any
19
+ df: pd.DataFrame
20
+ df_tokens: pd.DataFrame
21
+ df_tokens_cleaned: pd.DataFrame
22
+ df_tokens_merged: pd.DataFrame
23
+ split_sample_size: int
24
+ ds_name: str
25
+ ds_config_name: str
26
+ ds_split_name: str
27
+ split: Dataset
28
+ labels: list[str]
29
+
30
+
31
+ class Page:
32
+ """This class encapsulates the logic for a single page of the application."""
33
+
34
+ name: str
35
+ """The page's name that will be used in the sidebar menu."""
36
+
37
+ icon: str
38
+ """The page's icon that will be used in the sidebar menu."""
39
+
40
+ def _get_widget_defaults(self):
41
+ """This function holds the default settings for all widgets contained on this page.
42
+
43
+ Returns:
44
+ dict: A dictionary of widget defaults, where the keys are the widget names and the values are the defaults.
45
+ """
46
+ return {}
47
+
48
+ def render(self, context):
49
+ """This function renders the page."""
50
+ ...
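
A hypothetical example of how a new subpage would plug into this interface (the page below is made up for illustration and is not part of the commit): subclass Page, set name and icon, optionally provide widget defaults, and read whatever is needed from the shared Context.

import streamlit as st

from src.subpages.page import Context, Page


class LabelCountsPage(Page):  # hypothetical page, not in the repo
    name = "Label Counts"
    icon = "bar-chart"

    def _get_widget_defaults(self):
        return {"label_counts_normalize": False}

    def render(self, context: Context):
        st.title(self.name)
        normalize = st.checkbox("Show relative frequencies", key="label_counts_normalize")
        st.dataframe(context.df_tokens_cleaned["labels"].value_counts(normalize=normalize))
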
src/subpages/probing.py ADDED
@@ -0,0 +1,57 @@
1
+ """
2
+ A very direct and interactive way to test your model is by providing it with a list of text inputs and then inspecting the model outputs. The application features a multiline text field so the user can input multiple texts separated by newlines. For each text, the app will show a data frame containing the tokenized string, token predictions, probabilities and a visual indicator for low probability predictions -- these are the ones you should inspect first for prediction errors.
3
+ """
4
+ import streamlit as st
5
+
6
+ from src.subpages.page import Context, Page
7
+ from src.utils import device, tag_text
8
+
9
+ _DEFAULT_SENTENCES = """
10
+ Damit hatte er auf ihr letztes , völlig schiefgelaufenes Geschäftsessen angespielt .
11
+ Damit einher geht übrigens auch , dass Marcella , Collocinis Tochter , keine wie auch immer geartete strafrechtliche Verfolgung zu befürchten hat .
12
+ Nach dem Bell ’ schen Theorem , einer Physik jenseits der Quanten , ist die Welt , die wir für real halten , nicht objektivierbar .
13
+ Dazu muss man wiederum wissen , dass die Aussagekraft von Tests , neben der Sensitivität und Spezifität , ganz entscheidend von der Vortestwahrscheinlichkeit abhängt .
14
+ Haben Sie sich schon eingelebt ? « erkundigte er sich .
15
+ Das Auto ein Totalschaden , mein Beifahrer ein weinender Jammerlappen .
16
+ Seltsam , wunderte sie sich , dass das Stück nach mehr als eineinhalb Jahrhunderten noch so gut in Schuss ist .
17
+ Oder auf den Strich gehen , Strümpfe stricken , Geld hamstern .
18
+ Und Allah ist Allumfassend Allwissend .
19
+ Und Pedro Moacir redete weiter : » Verzicht , Pater Antonio , Verzicht , zu großer Schmerz über Verzicht , Sehnsucht , die sich nicht erfüllt , die sich nicht erfüllen kann , das sind Qualen , die ein Verstummen nach sich ziehen können , oder Härte .
20
+ Mama-San ging mittlerweile fast ausnahmslos nur mit Wei an ihrer Seite aus dem Haus , kaum je mit einem der Mädchen und niemals allein.
21
+ """.strip()
22
+ _DEFAULT_SENTENCES = """
23
+ Elon Musk’s Berghain humiliation — I know the feeling
24
+ Musk was also seen at a local spot called Sisyphos celebrating entrepreneur Adeo Ressi's birthday, according to The Times.
25
+ """.strip()
26
+
27
+
28
+ class ProbingPage(Page):
29
+ name = "Probing"
30
+ icon = "fonts"
31
+
32
+ def _get_widget_defaults(self):
33
+ return {"probing_textarea": _DEFAULT_SENTENCES}
34
+
35
+ def render(self, context: Context):
36
+ st.title("🔠 Interactive Probing")
37
+
38
+ with st.expander("💡", expanded=True):
39
+ st.write(
40
+ "A very direct and interactive way to test your model is by providing it with a list of text inputs and then inspecting the model outputs. The application features a multiline text field so the user can input multiple texts separated by newlines. For each text, the app will show a data frame containing the tokenized string, token predictions, probabilities and a visual indicator for low probability predictions -- these are the ones you should inspect first for prediction errors."
41
+ )
42
+
43
+ sentences = st.text_area("Sentences", height=200, key="probing_textarea")
44
+ if not sentences.strip():
45
+ return
46
+ sentences = [sentence.strip() for sentence in sentences.splitlines()]
47
+
48
+ for sent in sentences:
49
+ sent = sent.replace(",", "").replace(" ", " ")
50
+ with st.expander(sent):
51
+ tagged = tag_text(sent, context.tokenizer, context.model, device)
52
+ tagged = tagged.astype(str)
53
+ tagged["probs"] = tagged["probs"].apply(lambda x: x[:-2])
54
+ tagged["check"] = tagged["probs"].apply(
55
+ lambda x: "✅ ✅" if int(x) < 100 else "✅" if int(x) < 1000 else ""
56
+ )
57
+ st.dataframe(tagged.drop("hidden_states", axis=1).T)
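
A rough non-Streamlit sketch of the same check (the model name is just an example; the app loads its model elsewhere): tag a sentence with tag_text from src/utils.py and look at the "probs" column, where small values are the low-confidence predictions the page flags with check marks.

from transformers import AutoModelForTokenClassification, AutoTokenizer

from src.utils import device, tag_text

model_name = "elastic/distilbert-base-uncased-finetuned-conll03-english"  # example NER model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name).to(device)

tagged = tag_text("Elon Musk was seen in Berlin.", tokenizer, model, device)
# tag_text stores 1 // min(softmax) per token in "probs"; the page treats small values
# (e.g. < 100) as low-confidence predictions that should be inspected first.
print(tagged[tagged["probs"] < 100][["tokens", "preds", "probs"]])
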
src/subpages/random_samples.py ADDED
@@ -0,0 +1,50 @@
1
+ """Show random samples. Simple method, but it often turns up interesting things."""
2
+ import pandas as pd
3
+ import streamlit as st
4
+
5
+ from src.subpages.page import Context, Page
6
+ from src.utils import htmlify_labeled_example
7
+
8
+
9
+ class RandomSamplesPage(Page):
10
+ name = "Random Samples"
11
+ icon = "shuffle"
12
+
13
+ def _get_widget_defaults(self):
14
+ return {
15
+ "random_sample_size_min": 128,
16
+ }
17
+
18
+ def render(self, context: Context):
19
+ st.title("🎲 Random Samples")
20
+ with st.expander("💡", expanded=True):
21
+ st.write(
22
+ "Show random samples. Simple method, but it often turns up interesting things."
23
+ )
24
+
25
+ random_sample_size = st.number_input(
26
+ "Random sample size:",
27
+ value=min(st.session_state.random_sample_size_min, context.split_sample_size),
28
+ step=16,
29
+ key="random_sample_size",
30
+ )
31
+
32
+ if st.button("🎲 Resample"):
33
+ st.experimental_rerun()
34
+
35
+ random_indices = context.df.sample(int(random_sample_size)).index
36
+ samples = context.df_tokens_merged.loc[random_indices]
37
+
38
+ for i, idx in enumerate(random_indices):
39
+ sample = samples.loc[idx]
40
+
41
+ if isinstance(sample, pd.Series):
42
+ continue
43
+
44
+ col1, _, col2 = st.columns([0.08, 0.025, 0.8])
45
+
46
+ counter = f"<span title='#sample | index' style='display: block; background-color: black; opacity: 1; color: white; padding: 0 5px'>[{i+1} | {idx}]</span>"
47
+ loss = f"<span title='total loss' style='display: block; background-color: yellow; color: gray; padding: 0 5px;'>𝐿 {sample.losses.sum():.3f}</span>"
48
+ col1.write(f"{counter}{loss}", unsafe_allow_html=True)
49
+ col1.write("")
50
+ col2.write(htmlify_labeled_example(sample), unsafe_allow_html=True)
src/subpages/raw_data.py ADDED
@@ -0,0 +1,57 @@
1
+ """See the data as seen by your model."""
2
+ import pandas as pd
3
+ import streamlit as st
4
+
5
+ from src.subpages.page import Context, Page
6
+ from src.utils import aggrid_interactive_table
7
+
8
+
9
+ @st.cache
10
+ def convert_df(df):
11
+ return df.to_csv().encode("utf-8")
12
+
13
+
14
+ class RawDataPage(Page):
15
+ name = "Raw data"
16
+ icon = "qr-code"
17
+
18
+ def render(self, context: Context):
19
+ st.title(self.name)
20
+ with st.expander("💡", expanded=True):
21
+ st.write("See the data as seen by your model.")
22
+
23
+ st.subheader("Dataset")
24
+ st.code(
25
+ f"Dataset: {context.ds_name}\nConfig: {context.ds_config_name}\nSplit: {context.ds_split_name}"
26
+ )
27
+
28
+ st.write("**Data after processing and inference**")
29
+
30
+ processed_df = (
31
+ context.df_tokens.drop("hidden_states", axis=1).drop("attention_mask", axis=1).round(3)
32
+ )
33
+ cols = (
34
+ "ids input_ids token_type_ids word_ids losses tokens labels preds total_loss".split()
35
+ )
36
+ if "token_type_ids" not in processed_df.columns:
37
+ cols.remove("token_type_ids")
38
+ processed_df = processed_df[cols]
39
+ aggrid_interactive_table(processed_df)
40
+ processed_df_csv = convert_df(processed_df)
41
+ st.download_button(
42
+ "Download csv",
43
+ processed_df_csv,
44
+ "processed_data.csv",
45
+ "text/csv",
46
+ )
47
+
48
+ st.write("**Raw data (exploded by tokens)**")
49
+ raw_data_df = context.split.to_pandas().apply(pd.Series.explode) # type: ignore
50
+ aggrid_interactive_table(raw_data_df)
51
+ raw_data_df_csv = convert_df(raw_data_df)
52
+ st.download_button(
53
+ "Download csv",
54
+ raw_data_df_csv,
55
+ "raw_data.csv",
56
+ "text/csv",
57
+ )
src/utils.py ADDED
@@ -0,0 +1,255 @@
1
+ from pathlib import Path
2
+
3
+ import matplotlib as matplotlib
4
+ import matplotlib.cm as cm
5
+ import pandas as pd
6
+ import streamlit as st
7
+ import tokenizers
8
+ import torch
9
+ import torch.nn.functional as F
10
+ from st_aggrid import AgGrid, GridOptionsBuilder, GridUpdateMode
11
+
12
+ PROJ = Path(__file__).parent
13
+
14
+ tokenizer_hash_funcs = {
15
+ tokenizers.Tokenizer: lambda _: None,
16
+ tokenizers.AddedToken: lambda _: None,
17
+ }
18
+ # device = torch.device("cuda" if torch.cuda.is_available() else "cpu" if torch.has_mps else "cpu")
19
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
20
+
21
+ classmap = {
22
+ "O": "O",
23
+ "PER": "🙎",
24
+ "person": "🙎",
25
+ "LOC": "🌎",
26
+ "location": "🌎",
27
+ "ORG": "🏤",
28
+ "corporation": "🏤",
29
+ "product": "📱",
30
+ "creative": "🎷",
31
+ "MISC": "🎷",
32
+ }
33
+
34
+
35
+ def aggrid_interactive_table(df: pd.DataFrame) -> dict:
36
+ """Creates an st-aggrid interactive table based on a dataframe.
37
+
38
+ Args:
39
+ df (pd.DataFrame): Source dataframe
40
+ Returns:
41
+ dict: The selected row
42
+ """
43
+ options = GridOptionsBuilder.from_dataframe(
44
+ df, enableRowGroup=True, enableValue=True, enablePivot=True
45
+ )
46
+
47
+ options.configure_side_bar()
48
+ # options.configure_default_column(cellRenderer=JsCode('''function(params) {return '<a href="#samples-loss">'+params.value+'</a>'}'''))
49
+
50
+ options.configure_selection("single")
51
+ selection = AgGrid(
52
+ df,
53
+ enable_enterprise_modules=True,
54
+ gridOptions=options.build(),
55
+ theme="light",
56
+ update_mode=GridUpdateMode.NO_UPDATE,
57
+ allow_unsafe_jscode=True,
58
+ )
59
+
60
+ return selection
61
+
62
+
63
+ def explode_df(df: pd.DataFrame) -> pd.DataFrame:
64
+ """Takes a dataframe and explodes all the fields."""
65
+
66
+ df_tokens = df.apply(pd.Series.explode)
67
+ if "losses" in df.columns:
68
+ df_tokens["losses"] = df_tokens["losses"].astype(float)
69
+ return df_tokens # type: ignore
70
+
71
+
72
+ def align_sample(row: pd.Series):
73
+ """Uses word_ids to align all lists in a sample."""
74
+
75
+ columns = row.axes[0].to_list()
76
+ indices = [i for i, id in enumerate(row.word_ids) if id >= 0 and id != row.word_ids[i - 1]]
77
+
78
+ out = {}
79
+
80
+ tokens = []
81
+ for i, tok in enumerate(row.tokens):
82
+ if row.word_ids[i] == -1:
83
+ continue
84
+
85
+ if row.word_ids[i] != row.word_ids[i - 1]:
86
+ tokens.append(tok.lstrip("▁").lstrip("##").rstrip("@@"))
87
+ else:
88
+ tokens[-1] += tok.lstrip("▁").lstrip("##").rstrip("@@")
89
+ out["tokens"] = tokens
90
+
91
+ if "preds" in columns:
92
+ out["preds"] = [row.preds[i] for i in indices]
93
+
94
+ if "labels" in columns:
95
+ out["labels"] = [row.labels[i] for i in indices]
96
+
97
+ if "losses" in columns:
98
+ out["losses"] = [row.losses[i] for i in indices]
99
+
100
+ if "probs" in columns:
101
+ out["probs"] = [row.probs[i] for i in indices]
102
+
103
+ if "hidden_states" in columns:
104
+ out["hidden_states"] = [row.hidden_states[i] for i in indices]
105
+
106
+ if "ids" in columns:
107
+ out["ids"] = row.ids
108
+
109
+ assert len(tokens) == len(out["preds"]), (tokens, row.tokens)
110
+
111
+ return out
112
+
113
+
114
+ @st.cache(
115
+ allow_output_mutation=True,
116
+ hash_funcs=tokenizer_hash_funcs,
117
+ )
118
+ def tag_text(text: str, tokenizer, model, device: torch.device) -> pd.DataFrame:
119
+ """Tags a given text and creates an (exploded) DataFrame with the predicted labels and probabilities.
120
+
121
+ Args:
122
+ text (str): The text to be processed
123
+ tokenizer: Tokenizer to use
124
+ model: Model to use
125
+ device (torch.device): The device we want pytorch to use for its calculations.
126
+
127
+ Returns:
128
+ pd.DataFrame: A data frame holding the tagged text.
129
+ """
130
+
131
+ tokens = tokenizer(text).tokens()
132
+ tokenized = tokenizer(text, return_tensors="pt")
133
+ word_ids = [w if w is not None else -1 for w in tokenized.word_ids()]
134
+ input_ids = tokenized.input_ids.to(device)
135
+ outputs = model(input_ids, output_hidden_states=True)
136
+ preds = torch.argmax(outputs.logits, dim=2)
137
+ preds = [model.config.id2label[p] for p in preds[0].cpu().numpy()]
138
+ hidden_states = outputs.hidden_states[-1][0].detach().cpu().numpy()
139
+ # hidden_states = np.mean([hidden_states, outputs.hidden_states[0][0].detach().cpu().numpy()], axis=0)
140
+
141
+ probs = 1 // (
142
+ torch.min(F.softmax(outputs.logits, dim=-1), dim=-1).values[0].detach().cpu().numpy()
143
+ )
144
+
145
+ df = pd.DataFrame(
146
+ [[tokens, word_ids, preds, probs, hidden_states]],
147
+ columns="tokens word_ids preds probs hidden_states".split(),
148
+ )
149
+ merged_df = pd.DataFrame(df.apply(align_sample, axis=1).tolist())
150
+ return explode_df(merged_df).reset_index().drop(columns=["index"])
151
+
152
+
153
+ def get_bg_color(label: str):
154
+ """Retrieves a label's color from the session state."""
155
+ return st.session_state[f"color_{label}"]
156
+
157
+
158
+ def get_fg_color(bg_color_hex: str) -> str:
159
+ """Chooses the proper (foreground) text color (black/white) for a given background color, maximizing contrast.
160
+
161
+ Adapted from https://gomakethings.com/dynamically-changing-the-text-color-based-on-background-color-contrast-with-vanilla-js/
162
+
163
+ Args:
164
+ bg_color_hex (str): The background color given as a HEX string.
165
+
166
+ Returns:
167
+ str: Either "black" or "white".
168
+ """
169
+ r = int(bg_color_hex[1:3], 16)
170
+ g = int(bg_color_hex[3:5], 16)
171
+ b = int(bg_color_hex[5:7], 16)
172
+ yiq = ((r * 299) + (g * 587) + (b * 114)) / 1000
173
+ return "black" if (yiq >= 128) else "white"
174
+
175
+
176
+ def colorize_classes(df: pd.DataFrame) -> pd.DataFrame:
177
+ """Colorizes the errors in the dataframe."""
178
+
179
+ def colorize_row(row):
180
+ return [
181
+ "background-color: "
182
+ + ("white" if (row["labels"] == "IGN" or (row["preds"] == row["labels"])) else "pink")
183
+ + ";"
184
+ ] * len(row)
185
+
186
+ def colorize_col(col):
187
+ if col.name == "labels" or col.name == "preds":
188
+ bgs = []
189
+ fgs = []
190
+ for v in col.values:
191
+ bgs.append(get_bg_color(v.split("-")[1]) if "-" in v else "#ffffff")
192
+ fgs.append(get_fg_color(bgs[-1]))
193
+ return [f"background-color: {bg}; color: {fg};" for bg, fg in zip(bgs, fgs)]
194
+ return [""] * len(col)
195
+
196
+ df = df.reset_index().drop(columns=["index"]).T
197
+ return df # .style.apply(colorize_col, axis=0)
198
+
199
+
200
+ def htmlify_labeled_example(example: pd.DataFrame) -> str:
201
+ """Builds an HTML (string) representation of a single example.
202
+
203
+ Args:
204
+ example (pd.DataFrame): The example to process.
205
+
206
+ Returns:
207
+ str: An HTML string representation of a single example.
208
+ """
209
+ html = []
210
+
211
+ for _, row in example.iterrows():
212
+ pred = row.preds.split("-")[1] if "-" in row.preds else "O"
213
+ label = row.labels
214
+ label_class = row.labels.split("-")[1] if "-" in row.labels else "O"
215
+
216
+ color = get_bg_color(row.preds.split("-")[1]) if "-" in row.preds else "#000000"
217
+ true_color = get_bg_color(row.labels.split("-")[1]) if "-" in row.labels else "#000000"
218
+
219
+ font_color = get_fg_color(color) if color else "white"
220
+ true_font_color = get_fg_color(true_color) if true_color else "white"
221
+
222
+ is_correct = row.preds == row.labels
223
+ loss_html = (
224
+ ""
225
+ if float(row.losses) < 0.01
226
+ else f"<span style='background-color: yellow; color: {font_color}; padding: 0 5px;'>{row.losses:.3f}</span>"
227
+ )
228
+ loss_html = ""
229
+
230
+ if row.labels == row.preds == "O":
231
+ html.append(f"<span>{row.tokens}</span>")
232
+ elif row.labels == "IGN":
233
+ assert False
234
+ else:
235
+ opacity = "1" if not is_correct else "0.5"
236
+ correct = (
237
+ ""
238
+ if is_correct
239
+ else f"<span title='{label}' style='background-color: {true_color}; opacity: 1; color: {true_font_color}; padding: 0 5px; border: 1px solid black; min-width: 30px'>{classmap[label_class]}</span>"
240
+ )
241
+ pred_icon = classmap[pred] if pred != "O" and row.preds[:2] != "I-" else ""
242
+ html.append(
243
+ f"<span style='border: 1px solid black; color: {color}; padding: 0 5px;' title={row.preds}>{pred_icon + ' '}{row.tokens}</span>{correct}{loss_html}"
244
+ )
245
+
246
+ return " ".join(html)
247
+
248
+
249
+ def color_map_color(value: float, cmap_name="Set1", vmin=0, vmax=1) -> str:
250
+ """Turns a value into a color using a color map."""
251
+ norm = matplotlib.colors.Normalize(vmin=vmin, vmax=vmax)
252
+ cmap = cm.get_cmap(cmap_name) # PiYG
253
+ rgba = cmap(norm(abs(value)))
254
+ color = matplotlib.colors.rgb2hex(rgba[:3])
255
+ return color
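
Finally, a short usage sketch for the two color helpers above (the printed values are examples): color_map_color turns a fraction into a hex color from the Set1 colormap, and get_fg_color picks black or white text for it based on the YIQ brightness formula.

from src.utils import color_map_color, get_fg_color

bg = color_map_color(0.25)  # e.g. "#4daf4a" (a green from Set1)
fg = get_fg_color(bg)       # "black", since this background is bright enough (YIQ >= 128)
print(bg, fg)
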