import html
import os

import pandas as pd
import sparknlp
import streamlit as st

# The star imports keep their original relative order: on a name clash the
# later import wins, so reordering them could change which object is bound.
from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from annotated_text import annotated_text
|
|
|
|
|
# Page-wide Streamlit settings: wide layout, sidebar state left to Streamlit.
st.set_page_config(layout="wide", initial_sidebar_state="auto")

# CSS for the title banner (.main-title) and the grey description cards
# (.section) that the page body renders further down.
PAGE_STYLE = """
<style>
.main-title {
font-size: 36px;
color: #4A90E2;
font-weight: bold;
text-align: center;
}
.section {
background-color: #f9f9f9;
padding: 10px;
border-radius: 10px;
margin-top: 10px;
}
.section p, .section ul {
color: #666666;
}
</style>
"""
st.markdown(PAGE_STYLE, unsafe_allow_html=True)
|
|
|
@st.cache_resource
def init_spark():
    """Start Spark NLP and return whatever ``sparknlp.start()`` returns.

    The function is wrapped in ``st.cache_resource``, so the start-up call is
    made through Streamlit's resource cache rather than directly.
    """
    session = sparknlp.start()
    return session
|
|
|
@st.cache_resource
def create_pipeline(model):
    """Build the unfitted NER pipeline around a pretrained CamemBERT classifier.

    Stages, in order:
      1. ``DocumentAssembler``: ``text`` -> ``document``
      2. ``Tokenizer``: ``document`` -> ``token``
      3. ``CamemBertForTokenClassification``: ``document``, ``token`` -> ``ner``
      4. ``NerConverter``: ``document``, ``token``, ``ner`` -> ``ner_chunk``

    Args:
        model: Name of the pretrained token-classification model; it is passed
            to ``pretrained`` together with the language code ``'en'``.

    Returns:
        A ``pyspark.ml.Pipeline`` holding the four stages. It is not fitted.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol('text')
        .setOutputCol('document')
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(['document'])
        .setOutputCol('token')
    )

    # ``pretrained`` is a loader exposed on the class itself; calling it there
    # avoids constructing an empty annotator that is discarded immediately.
    token_classifier = (
        CamemBertForTokenClassification.pretrained(model, 'en')
        .setInputCols(['document', 'token'])
        .setOutputCol('ner')
        .setCaseSensitive(True)
        .setMaxSentenceLength(512)
    )

    ner_converter = (
        NerConverter()
        .setInputCols(['document', 'token', 'ner'])
        .setOutputCol('ner_chunk')
    )

    return Pipeline(stages=[
        document_assembler,
        tokenizer,
        token_classifier,
        ner_converter,
    ])
|
|
|
def fit_data(pipeline, data):
    """Fit ``pipeline`` on a placeholder frame and fully annotate ``data``.

    The pipeline is fitted on a one-row DataFrame whose ``text`` column holds
    an empty string, wrapped in a ``LightPipeline``, and then applied to
    ``data`` with ``fullAnnotate``.

    Note: this reads the module-level ``spark`` object, so that name must be
    bound before the function is called.

    Args:
        pipeline: The unfitted pipeline to fit.
        data: Input handed straight to ``LightPipeline.fullAnnotate``.

    Returns:
        Whatever ``fullAnnotate`` returns for ``data``.
    """
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(placeholder_df))
    return light_model.fullAnnotate(data)
|
|
|
def build_annotated_segments(document, chunks, labels):
    """Cut ``document`` into plain strings and ``(chunk, label)`` tuples.

    Chunks are located left to right: each one is searched for only in the
    text that follows the previously matched chunk, so repeated entity
    strings map to successive occurrences.

    A chunk that is empty, or that does not occur in the remaining text, is
    skipped instead of raising; the text it would have covered stays plain.

    Args:
        document: The full text.
        chunks: Entity strings, expected in document order.
        labels: One label per chunk; paired with ``chunks`` via ``zip``.

    Returns:
        A list mixing ``str`` (unlabelled text) and ``(chunk, label)`` tuples,
        in reading order. Empty plain-text pieces are omitted.
    """
    segments = []
    remaining = document
    for chunk, label in zip(chunks, labels):
        if not chunk:
            # str.partition rejects an empty separator; nothing to highlight.
            continue
        before, matched, after = remaining.partition(chunk)
        if not matched:
            # The chunk cannot be located in what is left of the text, so it
            # is left unhighlighted rather than crashing the page.
            continue
        if before:
            segments.append(before)
        segments.append((chunk, label))
        remaining = after
    if remaining:
        segments.append(remaining)
    return segments


def annotate(data):
    """Render the document with its NER chunks highlighted.

    Args:
        data: Mapping with the keys ``"Document"`` (the text), ``"NER Chunk"``
            (entity strings) and ``"NER Label"`` (one label per chunk).

    The segments are passed as positional arguments to ``annotated_text``;
    nothing is returned.
    """
    document, chunks, labels = data["Document"], data["NER Chunk"], data["NER Label"]
    annotated_text(*build_annotated_segments(document, chunks, labels))
|
|
|
|
|
# Page heading, styled through the "main-title" CSS class.
st.markdown(
    '<div class="main-title">Recognize Entities with CamemBERT</div>',
    unsafe_allow_html=True,
)

# Short description card shown under the heading.
st.markdown(
    """
<div class="section">
<p>This model performs Named Entity Recognition (NER) using CamemBERT, a powerful language model fine-tuned specifically for French. It can accurately identify entities such as locations, organizations, persons, and miscellaneous categories in texts.</p>
</div>
""",
    unsafe_allow_html=True,
)

# Sidebar: model picker. The chosen name is later handed to create_pipeline.
model = st.sidebar.selectbox(
    label="Choose the pretrained model",
    options=['camembert_base_token_classifier_wikiner'],
    help="For more info about the models visit: https://sparknlp.org/models",
)

# Sidebar: badge linking to the reference notebook.
link = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/5cd574dd8065d3d7406816bee36b1ef56b3f9359/Spark_NLP_Udemy_MOOC/Open_Source/17.01.Transformers-based_Embeddings.ipynb#L102">
<img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
st.sidebar.markdown('Reference notebook:')
st.sidebar.markdown(link, unsafe_allow_html=True)
|
|
|
|
|
|
|
|
|
# Sample passages offered in the drop-down: three in English followed by
# their French counterparts.
examples = [
    """Barack Obama was born in Hawaii and later became the 44th President of the United States. He attended Columbia University and Harvard Law School, where he served as the president of the Harvard Law Review. After graduation, he worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School. Obama's presidential campaign began in 2007, and he was elected as the first African American president in 2008. During his presidency, he signed into law the Affordable Care Act, passed the Dodd-Frank Act, and ordered the military operation that resulted in the death of Osama bin Laden.""",
    """Paris is the capital of France and one of the most visited cities in the world. The Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral are among its most iconic landmarks. The city is also a global center for art, fashion, gastronomy, and culture. In addition to its historical sites, Paris is known for its cafés, parks, and gardens. The River Seine runs through the city, adding to its charm and providing picturesque views. Paris has been a major hub for education, politics, and commerce for centuries.""",
    """Apple Inc. is an American multinational technology company headquartered in Cupertino, California. It was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976. Apple is known for its innovative products, including the iPhone, iPad, and Mac computers. The company has a significant presence in Paris, where it operates several retail stores and offices. Apple's commitment to design and user experience has made it one of the most valuable companies in the world. The company continues to lead the industry in technology and sustainability initiatives.""",
    """Barack Obama est né à Hawaï et est ensuite devenu le 44e président des États-Unis. Il a étudié à l'Université Columbia et à la Faculté de droit de Harvard, où il a été président de la Harvard Law Review. Après avoir obtenu son diplôme, il a travaillé comme avocat spécialisé en droits civiques et a enseigné le droit constitutionnel à la Faculté de droit de l'Université de Chicago. La campagne présidentielle d'Obama a commencé en 2007, et il a été élu premier président afro-américain en 2008. Pendant sa présidence, il a promulgué la loi sur les soins abordables, fait adopter la loi Dodd-Frank, et ordonné l'opération militaire qui a conduit à la mort d'Oussama ben Laden.""",
    """Paris est la capitale de la France et l'une des villes les plus visitées au monde. La Tour Eiffel, le Musée du Louvre et la Cathédrale Notre-Dame comptent parmi ses monuments les plus emblématiques. La ville est également un centre mondial de l'art, de la mode, de la gastronomie et de la culture. En plus de ses sites historiques, Paris est connue pour ses cafés, ses parcs et ses jardins. La Seine traverse la ville, ajoutant à son charme et offrant des vues pittoresques. Paris est depuis des siècles un important centre d'éducation, de politique et de commerce.""",
    """Apple Inc. est une multinationale technologique américaine dont le siège est à Cupertino, en Californie. Elle a été fondée par Steve Jobs, Steve Wozniak et Ronald Wayne en avril 1976. Apple est connue pour ses produits innovants, notamment l'iPhone, l'iPad et les ordinateurs Mac. La société a une présence importante à Paris, où elle exploite plusieurs magasins de détail et bureaux. L'engagement d'Apple en matière de design et d'expérience utilisateur en a fait l'une des entreprises les plus précieuses au monde. La société continue de diriger l'industrie en matière de technologie et d'initiatives de durabilité.""",
]

selected_text = st.selectbox(label="Select an example", options=examples)
custom_input = st.text_input(label="Try it with your own Sentence!")

# Text typed by the user takes precedence; an empty box falls back to the
# example picked in the drop-down.
text_to_analyze = custom_input or selected_text
|
|
|
st.subheader('Full example text')

# Bordered, horizontally scrollable box that echoes the text being analysed.
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""

# The text may come straight from the free-text input and is rendered with
# unsafe_allow_html=True, so escape &, < and > first; otherwise typed markup
# would be injected into the page instead of being shown literally.
# quote=False: the value is placed in element content, not in an attribute.
st.markdown(
    HTML_WRAPPER.format(html.escape(text_to_analyze, quote=False)),
    unsafe_allow_html=True,
)
|
|
|
|
|
# Start Spark NLP, build the pipeline for the chosen model, annotate the text.
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, text_to_analyze)

st.subheader("Processed output:")

# Translates the 'entity' value found in each chunk's metadata into a display
# label; anything not listed is shown as 'UNKNOWN'.
# NOTE(review): the keys look like PER/ORG/LOC/MISC with their first two
# characters removed -- presumably a prefix-stripping side effect upstream;
# confirm against the model's actual tag set.
abbreviation_mapping = {'R': 'PER', 'G': 'ORG', 'C': 'LOC', 'SC': 'MISC'}

# Only the first annotation result is used.
first_result = output[0]
results = {
    'Document': first_result['document'][0].result,
    'NER Chunk': [chunk.result for chunk in first_result['ner_chunk']],
    'NER Label': [
        abbreviation_mapping.get(chunk.metadata['entity'], 'UNKNOWN')
        for chunk in first_result['ner_chunk']
    ],
}

annotate(results)

with st.expander("View DataFrame"):
    df = pd.DataFrame({
        'NER Chunk': results['NER Chunk'],
        'NER Label': results['NER Label'],
    })
    # Number the rows from 1 instead of 0 for display.
    df.index += 1
    st.dataframe(df)