Spaces:

hs-knowledge
/

ner_app

Sleeping

App Files Community

finiteautomata committed on Mar 31, 2023

Commit

8739181

1 Parent(s): d01b5c5

First commit

Browse files

Files changed (3) hide show

.gitignore +136 -0
app.py +82 -0
requirements.txt +3 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,136 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+data/*
+config/*.ini
+*.bin
+**/**/hs_clf
+models/*
+wandb/*
+*.pt

app.py ADDED Viewed

	@@ -0,0 +1,82 @@

+# Streamlit app to highlight NER entities
+import random
+import streamlit as st
+from datasets import load_dataset
+from annotated_text import annotated_text
+# Load data
+ds = load_dataset("hs-knowledge/hateval_ner")
+ds_2 = load_dataset("hs-knowledge/hateval_ner_2")
+# Show highlighted ner entities in a tweet
+def display_text(example):
+    # Use annotated_text to show entities
+    ner_output = example["ner_output"]
+    chunks = []
+    current_chunk = ""
+    current_type = None
+    # Check if there are two labels repeated
+    previous_label = None
+    for label in ner_output["labels"]:
+        if label and previous_label and previous_label == label and label != "O" and not label.startswith("I-") and not label.startswith("B-"):
+            pass
+        previous_label = label
+    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
+        if label is None:
+            # Perhaps it is too long
+            continue
+        if label == "O":
+            if current_type is not None:
+                # Add previous entity
+                chunks.append((current_chunk.strip(), current_type))
+                current_chunk = token + " "
+                current_type = None
+            else:
+                current_chunk += token + " "
+                current_type = None
+        elif label.startswith("B-"):
+            if current_chunk:
+                chunks.append((current_chunk.strip(), current_type))
+            current_chunk = token + " "
+            current_type = label[2:]
+        elif label.startswith('I-'):
+            current_chunk += token + " "
+            current_type = label[2:]
+        else:
+            # It doesn't start with B- or I- => add single token
+            if label != current_type:
+                chunks.append((current_chunk.strip(), current_type))
+                current_chunk = token + " "
+                current_type = label
+            else:
+                current_chunk += token + " "
+                current_type = label
+    if current_chunk:
+        chunks.append((current_chunk.strip(), current_type))
+    # remove nones
+    chunks = [(c, t) if t is not None else c for c, t in chunks]
+    annotated_text(*chunks)
+# Get first 1000 examples
+elements = random.choices(range(len(ds["train"])), k=300)
+ds["train"] = ds["train"].select(elements)
+ds_2["train"] = ds_2["train"].select(elements)
+for ex1, ex2 in zip(ds["train"], ds_2["train"]):
+    st.write("====================================")
+    st.write("NER model: robertuito", "\n")
+    display_text(ex1)
+    st.write("NER model: roberta-large", "\n")
+    display_text(ex2)
+    st.write("\n")
+    st.write(f"Original text: {ex1['text']}")

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+datasets==2.9.0
+streamlit==1.18.0
+st-annotated-text==3.0.0