turkish-named-entity-recognition-tests

Runtime error

File size: 5,936 Bytes

e6bfe5c
 
85d8808
8d47d74
e6bfe5c
 
d25abcf
e6bfe5c
 
 
 
 
 
 
 
 
c4cf7b8
 
e6bfe5c
 
f474e98
 
 
edbb9d6
 
f474e98
e6bfe5c
f474e98
 
a834bc3
f474e98
 
 
e6bfe5c
41ca91f
a834bc3
f474e98
 
edbb9d6
f474e98
 
 
 
 
 
a834bc3
8a8244f
e6bfe5c
f474e98
1eab2d6
 
f474e98
 
 
e6bfe5c
f474e98
 
e6bfe5c
39e80d1
e6bfe5c
a834bc3
 
e6bfe5c
5b3d11c
24ff6bd
d84f151
 
 
 
e6bfe5c
 
 
 
 
 
 
 
41ca91f
 
 
 
e6bfe5c
 
d25abcf
e6bfe5c
d25abcf
 
 
 
 
 
 
 
 
 
 
 
 
7ae6e02
 
 
d25abcf

# Turkish NER Demo for Various Models

from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, DebertaV2Tokenizer, DebertaV2Model
import sentencepiece
import streamlit as st
import pandas as pd
import spacy

# Sample Turkish texts the user can pick from instead of typing their own.
# The first entry is a single short sentence; the second is a multi-paragraph
# news excerpt (triple-quoted, so it contains literal newlines).
example_list = [
    "Mustafa Kemal Atatürk 1919 yılında Samsun'a çıktı.", 
    """Dünya çapında 40 milyondan fazla insana bulaşan ve 1.1 milyondan fazla insanın ölümüne sebep olan corona virüsüne karşı Pfizer ile BioNTech'in geliştirdiği aşının ilk görüntüleri ortaya çıktı. Aşının fabrikadaki ilk görüntülerini değerlendiren Pfizer'ın Birleşik Krallık CEO'su, "Üretim bandında aşıyı görmek beni neşelendirdi" dedi.
ABD merkezli çokuluslu ilaç şirketi Pfizer ile Türk bilim insanlarının kurduğu BioNTech’in geliştirdiği corona virüsü aşısında sona gelindi… Pfizer, paylaştığı video ile bütün dünyayı heyecanlandıran gelişmeyi duyurdu.
Şirket, Belçika’daki Puurs’ta geliştirilen Covid-19 aşılarının seri üretim bandındaki üretim aşamasını uluslararası kamuoyu ile paylaştı. Almanya’nın Mainz kentinde Türk profesör Uğur Şahin ile eşi Özlem Türeci’nin kurduğu ve yönettiği biyoteknoloji şirketi BioNTech ile aşı sürecini sürdüren Pfizer’ın küçük şişelerde binlerce corona virüsü aşısı üretmeye başladığı belirtildi.
Pfizer, aşının güvenli ve etkili olduğunun klinik olarak da kanıtlanması ve resmi mercilerden de onay alınması durumunda üretilen aşının dağıtılacağını duyurdu."""
]

# ---- Page layout and heading ------------------------------------------------
st.set_page_config(layout="wide")
st.title("Demo for Turkish NER Models")

# Model checkpoints offered in the sidebar selector.
model_list = [
    'akdeniz27/bert-base-turkish-cased-ner',
    'akdeniz27/convbert-base-turkish-cased-ner',
    'akdeniz27/xlm-roberta-base-turkish-ner',
    'xlm-roberta-large-finetuned-conll03-english',
    'tner/tner-xlm-roberta-base-ontonotes5',
]

# Note displayed under the aggregation selector for the XLM-RoBERTa checkpoints.
xlm_agg_strategy_info = "'aggregation_strategy' can be selected as 'simple' or 'none' for 'xlm-roberta' because of the RoBERTa model's tokenization approach."

# ---- Sidebar: model choice --------------------------------------------------
st.sidebar.header("Select NER Model")
model_checkpoint = st.sidebar.radio("", model_list)

st.sidebar.write("For details of models: 'https://huggingface.co/akdeniz27/")
st.sidebar.write("")

# ---- Sidebar: aggregation strategy ------------------------------------------
st.sidebar.header("Select Aggregation Strategy Type")
_is_turkish_xlm = model_checkpoint == "akdeniz27/xlm-roberta-base-turkish-ner"
_is_english_xlm = model_checkpoint in (
    "xlm-roberta-large-finetuned-conll03-english",
    "tner/tner-xlm-roberta-base-ontonotes5",
)
if _is_turkish_xlm or _is_english_xlm:
    # XLM-RoBERTa checkpoints only get the 'simple' / 'none' choices.
    aggregation = st.sidebar.radio("", ('simple', 'none'))
    st.sidebar.write(xlm_agg_strategy_info)
    if _is_english_xlm:
        st.sidebar.write("")
        st.sidebar.write("This English NER model is included just to show the zero-shot transfer learning capability of XLM-Roberta.")
else:
    aggregation = st.sidebar.radio("", ('first', 'simple', 'average', 'max', 'none'))

st.sidebar.write("Please refer 'https://huggingface.co/transformers/_modules/transformers/pipelines/token_classification.html' for entity grouping with aggregation_strategy parameter.")

# ---- Main area: choose or type the text to analyse --------------------------
st.subheader("Select Text Input Method")
input_method = st.radio("", ('Select from Examples', 'Write or Paste New Text'))
if input_method == 'Select from Examples':
    # The chosen example pre-fills an editable text area.
    selected_text = st.selectbox('Select Text from List', example_list, index=0, key=1)
    st.subheader("Text to Run")
    input_text = st.text_area("Selected Text", selected_text, height=128, max_chars=None, key=2)
elif input_method == "Write or Paste New Text":
    st.subheader("Text to Run")
    input_text = st.text_area('Write or Paste Text Below', value="", height=128, max_chars=None, key=2)

@st.cache(allow_output_mutation=True)
def setModel(model_checkpoint, aggregation):
    """Create a token-classification ('ner') pipeline for the given checkpoint.

    The function is wrapped in ``st.cache`` so Streamlit can reuse the result
    for a repeated (checkpoint, aggregation) pair.

    Args:
        model_checkpoint: Model identifier passed to ``from_pretrained`` for
            both the model and its tokenizer.
        aggregation: Value forwarded as the pipeline's
            ``aggregation_strategy``.

    Returns:
        The object returned by ``transformers.pipeline``.
    """
    ner_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    ner_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    ner = pipeline(
        'ner',
        model=ner_model,
        tokenizer=ner_tokenizer,
        aggregation_strategy=aggregation,
    )
    return ner

@st.cache(allow_output_mutation=True)
def get_html(html: str):
    """Wrap markup in a bordered, horizontally scrollable ``<div>``.

    Newline characters in ``html`` are replaced by single spaces before the
    markup is inserted into the wrapper.

    Args:
        html: Markup to embed.

    Returns:
        The wrapper ``<div>`` string containing the flattened markup.
    """
    flattened = " ".join(html.split("\n"))
    return """<div style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem">{}</div>""".format(flattened)
        
# Run the selected model on the current text when the button is pressed, then
# show the entities both as a table and as displaCy-style highlighted text.
Run_Button = st.button("Run", key=None)
if Run_Button and not input_text.strip():
    # Nothing to analyse: avoid loading the model for empty/blank input.
    st.warning("Please enter some text before running the model.")
elif Run_Button:
    ner_pipeline = setModel(model_checkpoint, aggregation)
    output = ner_pipeline(input_text)

    if not output:
        # An empty result has no columns to select from, so the table code
        # below would raise KeyError; report the outcome instead.
        st.info("No entities were recognized in the given text.")
    else:
        # Aggregated results expose the label under 'entity_group'; with
        # aggregation 'none' the per-token label is under 'entity'.
        label_key = "entity_group" if aggregation != "none" else "entity"
        cols_to_keep = ['word', label_key, 'score', 'start', 'end']
        df_final = pd.DataFrame(output, columns=cols_to_keep)

        st.subheader("Recognized Entities")
        st.dataframe(df_final)

        st.subheader("Spacy Style Display")
        # Input dict for displaCy's manual mode: raw text plus character
        # offsets and labels of each entity.
        spacy_display = {
            "ents": [
                {"start": entity["start"], "end": entity["end"], "label": entity[label_key]}
                for entity in output
            ],
            "text": input_text,
            "title": None,
        }

        # Labels passed to displaCy's "ents" option.
        # NOTE(review): with aggregation 'none' the labels are presumably
        # BIO-prefixed (e.g. 'B-PER') and may not match this list - confirm.
        entity_list = ["PER", "LOC", "ORG", "MISC", "person", "location", "geopolitical area", "organization", "event", "group", "date", "ordinal number", "cardinal", "ordinal", "product", "quantity", "time", "money", "percent", "language"]
        html = spacy.displacy.render(spacy_display, style="ent", minify=True, manual=True, options={"ents": entity_list})
        # Keep each highlighted entity on one visual unit inside the wrapper.
        style = "<style>mark.entity { display: inline-block }</style>"
        st.write(f"{style}{get_html(html)}", unsafe_allow_html=True)