Spaces:

spark-nlp
/

CamemBERT

Running

File size: 8,561 Bytes

import html
import os

import pandas as pd
import sparknlp
import streamlit as st

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline
from sparknlp.pretrained import PretrainedPipeline
from annotated_text import annotated_text

# Page configuration (must stay the first Streamlit call in the script)
st.set_page_config(layout="wide", initial_sidebar_state="auto")

# Stylesheet for the ".main-title" banner and the grey ".section" cards
_PAGE_STYLE = """
    <style>
        .main-title {
            font-size: 36px;
            color: #4A90E2;
            font-weight: bold;
            text-align: center;
        }
        .section {
            background-color: #f9f9f9;
            padding: 10px;
            border-radius: 10px;
            margin-top: 10px;
        }
        .section p, .section ul {
            color: #666666;
        }
    </style>
"""
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)

@st.cache_resource
def init_spark():
    """Start Spark NLP and return the session it produces.

    The function is decorated with ``st.cache_resource`` so that Streamlit
    can reuse the returned session between script reruns.
    """
    session = sparknlp.start()
    return session

@st.cache_resource
def create_pipeline(model):
    """Build the unfitted Spark ML pipeline for CamemBERT token classification.

    Stages, in order:
      1. ``DocumentAssembler``: 'text' -> 'document'
      2. ``Tokenizer``: 'document' -> 'token'
      3. ``CamemBertForTokenClassification`` (pretrained): -> 'ner'
      4. ``NerConverter``: -> 'ner_chunk'

    Args:
        model: Name of the pretrained CamemBertForTokenClassification model
            to download (requested with language code 'en').

    Returns:
        A ``pyspark.ml.Pipeline`` holding the four stages; the caller is
        responsible for fitting it.
    """
    document_assembler = (
        DocumentAssembler()
        .setInputCol('text')
        .setOutputCol('document')
    )

    tokenizer = (
        Tokenizer()
        .setInputCols(['document'])
        .setOutputCol('token')
    )

    # ``pretrained`` is invoked on the class itself: it returns the downloaded
    # model, so constructing an empty annotator first would be wasted work.
    token_classifier = (
        CamemBertForTokenClassification.pretrained(model, 'en')
        .setInputCols(['document', 'token'])
        .setOutputCol('ner')
        .setCaseSensitive(True)
        .setMaxSentenceLength(512)
    )

    # Convert NER labels to entities
    ner_converter = (
        NerConverter()
        .setInputCols(['document', 'token', 'ner'])
        .setOutputCol('ner_chunk')
    )

    return Pipeline(stages=[
        document_assembler,
        tokenizer,
        token_classifier,
        ner_converter,
    ])

def fit_data(pipeline, data):
    """Fit *pipeline* and return its full annotation of *data*.

    The pipeline is fitted on a single-row DataFrame whose 'text' column is
    an empty string, wrapped in a ``LightPipeline``, and then applied to
    *data* through ``fullAnnotate``.

    Note: reads the module-level ``spark`` session, which must already be
    assigned when this function is called.
    """
    placeholder_df = spark.createDataFrame([['']]).toDF('text')
    light_model = LightPipeline(pipeline.fit(placeholder_df))
    return light_model.fullAnnotate(data)

def annotate(data):
    """Render the document with its NER chunks highlighted.

    The document is consumed left to right: each chunk is located in the
    part of the text that has not been emitted yet, the text before it is
    emitted as plain text, and the chunk is emitted as a ``(text, label)``
    pair. Whatever remains after the last chunk is emitted as plain text.

    Args:
        data: Mapping with the keys "Document" (the full text),
            "NER Chunk" (chunk strings) and "NER Label" (one label per
            chunk, paired with the chunks by position).

    A chunk that is empty or does not occur verbatim in the remaining text
    is skipped (its text stays un-annotated) instead of raising.
    """
    remaining = data["Document"]
    pieces = []
    for chunk, label in zip(data["NER Chunk"], data["NER Label"]):
        if not chunk:
            # An empty separator would make str.partition raise ValueError.
            continue
        before, found, after = remaining.partition(chunk)
        if not found:
            # Chunk text differs from the document text; leave the document
            # untouched and move on to the next chunk.
            continue
        if before:
            pieces.append(before)
        pieces.append((chunk, label))
        remaining = after
    if remaining:
        pieces.append(remaining)
    annotated_text(*pieces)

# Page header: title banner followed by a short description card
_TITLE_HTML = '<div class="main-title">Recognize Entities with CamemBERT</div>'
_INTRO_HTML = """
<div class="section">
    <p>This model performs Named Entity Recognition (NER) using CamemBERT, a powerful language model fine-tuned specifically for French. It can accurately identify entities such as locations, organizations, persons, and miscellaneous categories in texts.</p>
</div>
"""
st.markdown(_TITLE_HTML, unsafe_allow_html=True)
st.markdown(_INTRO_HTML, unsafe_allow_html=True)

# Sidebar: pretrained-model picker
_MODEL_CHOICES = ['camembert_base_token_classifier_wikiner']
model = st.sidebar.selectbox(
    "Choose the pretrained model",
    _MODEL_CHOICES,
    help="For more info about the models visit: https://sparknlp.org/models",
)

# Sidebar: badge linking to the reference notebook
link = """
<a href="https://github.com/JohnSnowLabs/spark-nlp-workshop/blob/5cd574dd8065d3d7406816bee36b1ef56b3f9359/Spark_NLP_Udemy_MOOC/Open_Source/17.01.Transformers-based_Embeddings.ipynb#L102">
    <img src="https://colab.research.google.com/assets/colab-badge.svg" style="zoom: 1.3" alt="Open In Colab"/>
</a>
"""
_sidebar = st.sidebar
_sidebar.markdown('Reference notebook:')
_sidebar.markdown(link, unsafe_allow_html=True)

# Example texts offered in the "Select an example" dropdown.
# The first three entries are English; the last three are French versions
# of the same three passages (Barack Obama, Paris, Apple Inc.).

examples = [
    """Barack Obama was born in Hawaii and later became the 44th President of the United States. He attended Columbia University and Harvard Law School, where he served as the president of the Harvard Law Review. After graduation, he worked as a civil rights attorney and taught constitutional law at the University of Chicago Law School. Obama's presidential campaign began in 2007, and he was elected as the first African American president in 2008. During his presidency, he signed into law the Affordable Care Act, passed the Dodd-Frank Act, and ordered the military operation that resulted in the death of Osama bin Laden.""",
    """Paris is the capital of France and one of the most visited cities in the world. The Eiffel Tower, Louvre Museum, and Notre-Dame Cathedral are among its most iconic landmarks. The city is also a global center for art, fashion, gastronomy, and culture. In addition to its historical sites, Paris is known for its cafés, parks, and gardens. The River Seine runs through the city, adding to its charm and providing picturesque views. Paris has been a major hub for education, politics, and commerce for centuries.""",
    """Apple Inc. is an American multinational technology company headquartered in Cupertino, California. It was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976. Apple is known for its innovative products, including the iPhone, iPad, and Mac computers. The company has a significant presence in Paris, where it operates several retail stores and offices. Apple's commitment to design and user experience has made it one of the most valuable companies in the world. The company continues to lead the industry in technology and sustainability initiatives.""",
    """Barack Obama est né à Hawaï et est ensuite devenu le 44e président des États-Unis. Il a étudié à l'Université Columbia et à la Faculté de droit de Harvard, où il a été président de la Harvard Law Review. Après avoir obtenu son diplôme, il a travaillé comme avocat spécialisé en droits civiques et a enseigné le droit constitutionnel à la Faculté de droit de l'Université de Chicago. La campagne présidentielle d'Obama a commencé en 2007, et il a été élu premier président afro-américain en 2008. Pendant sa présidence, il a promulgué la loi sur les soins abordables, fait adopter la loi Dodd-Frank, et ordonné l'opération militaire qui a conduit à la mort d'Oussama ben Laden.""",
    """Paris est la capitale de la France et l'une des villes les plus visitées au monde. La Tour Eiffel, le Musée du Louvre et la Cathédrale Notre-Dame comptent parmi ses monuments les plus emblématiques. La ville est également un centre mondial de l'art, de la mode, de la gastronomie et de la culture. En plus de ses sites historiques, Paris est connue pour ses cafés, ses parcs et ses jardins. La Seine traverse la ville, ajoutant à son charme et offrant des vues pittoresques. Paris est depuis des siècles un important centre d'éducation, de politique et de commerce.""",
    """Apple Inc. est une multinationale technologique américaine dont le siège est à Cupertino, en Californie. Elle a été fondée par Steve Jobs, Steve Wozniak et Ronald Wayne en avril 1976. Apple est connue pour ses produits innovants, notamment l'iPhone, l'iPad et les ordinateurs Mac. La société a une présence importante à Paris, où elle exploite plusieurs magasins de détail et bureaux. L'engagement d'Apple en matière de design et d'expérience utilisateur en a fait l'une des entreprises les plus précieuses au monde. La société continue de diriger l'industrie en matière de technologie et d'initiatives de durabilité."""
]

# Input widgets: pick one of the bundled examples or type a custom sentence
selected_text = st.selectbox("Select an example", examples)
custom_input = st.text_input("Try it with your own Sentence!")

# A non-empty custom sentence takes precedence over the selected example
text_to_analyze = custom_input if custom_input else selected_text

st.subheader('Full example text')
HTML_WRAPPER = """<div class="scroll entities" style="overflow-x: auto; border: 1px solid #e6e9ef; border-radius: 0.25rem; padding: 1rem; margin-bottom: 2.5rem; white-space:pre-wrap">{}</div>"""
# The text can come straight from the user and is rendered with
# unsafe_allow_html=True, so escape &, < and > to show it literally instead
# of letting it inject markup. quote=False: the text lands in element
# content, not in an attribute, so quotes need no escaping.
st.markdown(
    HTML_WRAPPER.format(html.escape(text_to_analyze, quote=False)),
    unsafe_allow_html=True,
)

# Start Spark, build the pipeline for the chosen model and annotate the text.
# ``spark`` must keep this name: fit_data() reads it as a module-level global.
spark = init_spark()
pipeline = create_pipeline(model)
output = fit_data(pipeline, text_to_analyze)

# Display matched sentence
st.subheader("Processed output:")

# Maps the entity value found in each chunk's metadata to a display label.
# NOTE(review): the keys look like PER/ORG/LOC/MISC with their first two
# characters removed - confirm against the tags the model actually emits.
abbreviation_mapping = {'R': 'PER', 'G': 'ORG', 'C': 'LOC', 'SC': 'MISC'}

# Only the first annotation result is used
first_result = output[0]
document_text = first_result['document'][0].result
ner_chunks = first_result['ner_chunk']

chunk_texts = [chunk.result for chunk in ner_chunks]
chunk_labels = [
    abbreviation_mapping.get(chunk.metadata['entity'], 'UNKNOWN')
    for chunk in ner_chunks
]

results = {
    'Document': document_text,
    'NER Chunk': chunk_texts,
    'NER Label': chunk_labels,
}

annotate(results)

with st.expander("View DataFrame"):
    df = pd.DataFrame({
        'NER Chunk': results['NER Chunk'],
        'NER Label': results['NER Label'],
    })
    # Number the rows from 1 instead of 0 for display
    df.index = df.index + 1
    st.dataframe(df)