abdullahmubeen10
committed on
Update pages/Workflow & Model Overview.py
pages/Workflow & Model Overview.py +231 -233
pages/Workflow & Model Overview.py
CHANGED
@@ -1,233 +1,231 @@
import streamlit as st

# Custom CSS for better styling
st.markdown("""
<style>
    .main-title {
        font-size: 36px;
        color: #4A90E2;
        font-weight: bold;
        text-align: center;
    }
    .sub-title {
        font-size: 24px;
        color: #4A90E2;
        margin-top: 20px;
    }
    .section {
        background-color: #f9f9f9;
        padding: 15px;
        border-radius: 10px;
        margin-top: 20px;
    }
    .section h2 {
        font-size: 22px;
        color: #4A90E2;
    }
    .section p, .section ul {
        color: #666666;
    }
    .link {
        color: #4A90E2;
        text-decoration: none;
    }
</style>
""", unsafe_allow_html=True)

# Introduction
st.markdown('<div class="main-title">Part-of-Speech Tagging with Spark NLP</div>', unsafe_allow_html=True)

st.markdown("""
<div class="section">
    <p>Part-of-Speech (POS) tagging is an essential task in Natural Language Processing (NLP) that involves identifying the grammatical roles of words within a text, such as nouns, verbs, adjectives, and more. This app demonstrates how to use the PerceptronModel annotator to perform POS tagging in text data using Spark NLP.</p>
</div>
""", unsafe_allow_html=True)

# About POS Tagging
st.markdown('<div class="sub-title">About Part-of-Speech Tagging</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>Part-of-Speech (POS) tagging involves assigning each word in a sentence its grammatical role, such as subject, verb, or adjective. This technique helps improve many NLP tasks, including Named Entity Recognition (NER), Word Sense Disambiguation (WSD), Question Answering (QA), and Dependency Parsing (DP).</p>
    <p>For instance, knowing that a word is an adjective increases the likelihood that one of the neighboring words is a noun. The context can also alter the meaning of words significantly:</p>
    <ul>
        <li><i>What is your address?</i> (noun)</li>
        <li><i>I will address this issue today.</i> (verb)</li>
    </ul>
    <p>POS tags can be categorized using schemas like the Universal Dependencies schema or the Penn Treebank POS tags. Each schema provides a set of tags for different grammatical roles.</p>
</div>
""", unsafe_allow_html=True)
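
# Added illustration (hedged, not part of the original page): a tiny snippet making the
# Penn Treebank mention above concrete by showing the tags for the two readings of "address".
st.markdown('<p>For example, under the Penn Treebank scheme the two readings of <i>address</i> above receive different tags:</p>', unsafe_allow_html=True)
st.code("""
What is your address?             ->  address / NN  (noun, singular)
I will address this issue today.  ->  address / VB  (verb, base form)
""", language="text")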

# Using PerceptronModel for POS Tagging
st.markdown('<div class="sub-title">Using PerceptronModel for POS Tagging in Spark NLP</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>The PerceptronModel annotator in Spark NLP allows users to perform POS tagging with high accuracy using pretrained models. This annotator can identify and label the grammatical roles of words in text data, providing valuable insights for various applications.</p>
    <p>The PerceptronModel annotator in Spark NLP offers:</p>
    <ul>
        <li>Accurate POS tagging using pretrained models</li>
        <li>Identification and labeling of grammatical roles</li>
        <li>Efficient processing of large text datasets</li>
        <li>Integration with other Spark NLP components for comprehensive NLP pipelines</li>
    </ul>
</div>
""", unsafe_allow_html=True)

st.markdown('<h2 class="sub-title">Example Usage in Python</h2>', unsafe_allow_html=True)
st.markdown('<p>Here’s how you can implement POS tagging using the PerceptronModel annotator in Spark NLP:</p>', unsafe_allow_html=True)

# Setup Instructions
st.markdown('<div class="sub-title">Setup</div>', unsafe_allow_html=True)
st.markdown('<p>To install Spark NLP in Python, use your favorite package manager (conda, pip, etc.). For example:</p>', unsafe_allow_html=True)
st.code("""
pip install spark-nlp
pip install pyspark
""", language="bash")

st.markdown("<p>Then, import Spark NLP and start a Spark session:</p>", unsafe_allow_html=True)
st.code("""
import sparknlp

# Start Spark Session
spark = sparknlp.start()
""", language='python')

# POS Tagging Example
st.markdown('<div class="sub-title">Example Usage: POS Tagging with PerceptronModel</div>', unsafe_allow_html=True)
st.code('''
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, PerceptronModel
from pyspark.ml import Pipeline
import pyspark.sql.functions as F

# Stage 1: Transforms raw texts to document annotation
document_assembler = DocumentAssembler() \\
    .setInputCol("text") \\
    .setOutputCol("document")

# Stage 2: Tokenization
tokenizer = Tokenizer() \\
    .setInputCols(["document"]) \\
    .setOutputCol("token")

# Stage 3: Perceptron model for POS Tagger
# Pretrained model pos_anc for texts in English
postagger = PerceptronModel.pretrained("pos_anc", "en") \\
    .setInputCols(["document", "token"]) \\
    .setOutputCol("pos")

# Define the pipeline
pipeline = Pipeline(stages=[document_assembler, tokenizer, postagger])

# Create the dataframe
data = spark.createDataFrame([["Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul"]]).toDF("text")

# Fit the dataframe to the pipeline to get the model
model = pipeline.fit(data)

# Transform the data to get predictions
result = model.transform(data)

# Display the POS tags
result.select(
    F.explode(
        F.arrays_zip(result.token.result, result.token.begin, result.token.end, result.pos.result)
    ).alias("cols")
).select(
    F.expr("cols['0']").alias("token"),
    F.expr("cols['1']").alias("begin"),
    F.expr("cols['2']").alias("end"),
    F.expr("cols['3']").alias("pos"),
).show(truncate=False)
''', language='python')

st.text("""
+------------+-----+---+---+
|token       |begin|end|pos|
+------------+-----+---+---+
|Unions      |0    |5  |NNP|
|representing|7    |18 |VBG|
|workers     |20   |26 |NNS|
|at          |28   |29 |IN |
|Turner      |31   |36 |NNP|
|Newall      |38   |43 |NNP|
|say         |45   |47 |VBP|
|they        |49   |52 |PRP|
|are         |54   |56 |VBP|
|'           |58   |58 |POS|
|disappointed|59   |70 |JJ |
|'           |71   |71 |POS|
|after       |73   |77 |IN |
|talks       |79   |83 |NNS|
|with        |85   |88 |IN |
|stricken    |90   |97 |NN |
|parent      |99   |104|NN |
|firm        |106  |109|NN |
|Federal     |111  |117|NNP|
|Mogul       |119  |123|NNP|
+------------+-----+---+---+
""")

st.markdown("""
<p>The code snippet demonstrates how to set up a pipeline in Spark NLP to perform POS tagging on text data using the PerceptronModel annotator. The resulting DataFrame contains the tokens and their corresponding POS tags.</p>
""", unsafe_allow_html=True)

# One-liner Alternative
st.markdown('<div class="sub-title">One-liner Alternative</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>In October 2022, John Snow Labs released the open-source <code>johnsnowlabs</code> library that contains all the company products, open-source and licensed, under one common library. This simplified the workflow, especially for users working with more than one of the libraries (e.g., Spark NLP + Healthcare NLP). This new library is a wrapper on all of John Snow Labs’ libraries and can be installed with pip:</p>
    <p><code>pip install johnsnowlabs</code></p>
    <p>To run POS tagging with one line of code, we can simply:</p>
</div>
""", unsafe_allow_html=True)
st.code("""
# Import the NLP module which contains Spark NLP and NLU libraries
from johnsnowlabs import nlp

example_sentence = "Unions representing workers at Turner Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul"

# Returns a pandas DataFrame, we select the desired columns
nlp.load("pos").predict(example_sentence)[['token','pos']]
""", language='python')

st.image('images/johnsnowlabs-output.png', use_column_width='auto')

# Summary
st.markdown('<div class="sub-title">Summary</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <p>In this demo app, we showcased how to perform Part-of-Speech tagging using the PerceptronModel annotator in Spark NLP. POS tagging is a crucial step in many NLP applications, helping to understand the grammatical structure and context of the text. With Spark NLP, you can efficiently process and analyze large volumes of text data, leveraging powerful pretrained models for accurate and reliable results.</p>
    <p>We hope you found this demo helpful and encourage you to explore more features and capabilities of Spark NLP for your NLP projects!</p>
</div>
""", unsafe_allow_html=True)

# References and Additional Information
st.markdown('<div class="sub-title">For additional information, please check the following references.</div>', unsafe_allow_html=True)

st.markdown("""
<div class="section">
    <ul>
        <li><a href="https://nlp.johnsnowlabs.com/docs/en/annotators" target="_blank" rel="noopener">Spark NLP documentation page</a> for all available annotators</li>
        <li>Python API documentation for <a href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/pos/perceptron/index.html#sparknlp.annotator.pos.perceptron.PerceptronModel" target="_blank" rel="noopener">PerceptronModel</a> and <a href="https://nlp.johnsnowlabs.com/api/python/reference/autosummary/sparknlp/annotator/dependency/dependency_parser/index.html#sparknlp.annotator.dependency.dependency_parser.DependencyParserModel" target="_blank" rel="noopener">Dependency Parser</a></li>
        <li>Scala API documentation for <a href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/pos/perceptron/PerceptronModel.html" target="_blank" rel="noopener">PerceptronModel</a> and <a href="https://nlp.johnsnowlabs.com/api/com/johnsnowlabs/nlp/annotators/parser/dep/DependencyParserModel.html" target="_blank" rel="noopener">DependencyParserModel</a></li>
        <li>For extended examples of usage of Spark NLP annotators, check the <a href="https://github.com/JohnSnowLabs/spark-nlp-workshop" target="_blank" rel="noopener">Spark NLP Workshop repository</a>.</li>
        <li>Minsky, M.L. and Papert, S.A. (1969) Perceptrons. MIT Press, Cambridge.</li>
    </ul>
</div>
""", unsafe_allow_html=True)

st.markdown('<div class="sub-title">Community & Support</div>', unsafe_allow_html=True)
st.markdown("""
<div class="section">
    <ul>
        <li><a class="link" href="https://sparknlp.org/" target="_blank">Official Website</a>: Documentation and examples</li>
        <li><a class="link" href="https://join.slack.com/t/spark-nlp/shared_invite/zt-198dipu77-L3UWNe_AJ8xqDk0ivmih5Q" target="_blank">Slack</a>: Live discussion with the community and team</li>
        <li><a class="link" href="https://github.com/JohnSnowLabs/spark-nlp" target="_blank">GitHub</a>: Bug reports, feature requests, and contributions</li>
        <li><a class="link" href="https://medium.com/spark-nlp" target="_blank">Medium</a>: Spark NLP articles</li>
        <li><a class="link" href="https://www.youtube.com/channel/UCmFOjlpYEhxf_wJUDuz6xxQ/videos" target="_blank">YouTube</a>: Video tutorials</li>
    </ul>
</div>
""", unsafe_allow_html=True)