aisyahhrazak committed on
Commit
7f2db85
·
verified ·
1 Parent(s): 68abf71

Upload 7 files

Browse files
IMG_8137.xlsx ADDED
Binary file (14.6 kB). View file
 
app.py ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline, AutoTokenizer
3
+ from classifier import MistralForSequenceClassification
4
+ import torch
5
+ import nltk
6
+ import json
7
+ import pandas as pd
8
+ import plotly.graph_objects as go
9
+ from wordcloud import WordCloud
10
+ import matplotlib.pyplot as plt
11
+ import io
12
+ import base64
13
+ from PIL import Image
14
+ from nltk import bigrams
15
+ import malaya
16
+ from collections import Counter
17
+
18
# English stop words shipped alongside the app; the stop-word lists are concatenated below.
with open('en.json', encoding='utf-8') as fopen:
    en = json.load(fopen)

# Stop-word list = Malaya's stop words + English ones + forum-specific filler tokens.
stopwords = malaya.text.function.get_stopwords()
stopwords = stopwords + en + ['lor', 'quote','Quote','QUOTE','pm', 'long', 'jer', 'time', 'feel', 'liao', 'wow', 'https', 'http', 've', 'ko', 'kena', 'post', 'ni', 'tu', 'don', 'je', 'jeh', 'la', 'tau', 'haha', 'hahaha', 'hahahaha']
stopwords += ['for me', 'to be', 'in the', 'me to', 'for me to']

# NLTK resources are fetched at import time (no-op when already cached).
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)

# Both classifiers share one tokenizer; weights are loaded in bfloat16.
tokenizer_tpb = AutoTokenizer.from_pretrained('mesolitica/malaysian-mistral-191M-MLM-512')
model_tpb = MistralForSequenceClassification.from_pretrained('HalalFoodNLP/tpb-model-halal', torch_dtype=torch.bfloat16)
model_sentiment = MistralForSequenceClassification.from_pretrained('malaysia-ai/sentiment-mistral-191M-MLM', torch_dtype=torch.bfloat16)
pipeline_tpb = pipeline(task="text-classification", model=model_tpb, tokenizer=tokenizer_tpb)
sentiment_pipeline = pipeline("sentiment-analysis", model=model_sentiment, tokenizer=tokenizer_tpb)

# Pre-labelled dataset used by the "Company View" search: one JSON object per line.
# Blank lines are skipped so a stray empty line does not abort start-up.
with open('sentiment-tpb-dataset.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

df = pd.DataFrame(data)
41
+
42
def generate_wordcloud(text):
    """Render a word cloud for *text* and return it as a PIL image.

    The cloud is drawn on a 10x5 inch matplotlib figure, saved as PNG into an
    in-memory buffer and re-opened with PIL.

    Args:
        text: Raw text to visualise.

    Returns:
        A ``PIL.Image.Image``, or ``None`` when *text* contains no usable
        words (so the caller can leave the image slot empty instead of
        crashing).
    """
    if not text or not text.strip():
        return None

    try:
        wordcloud = WordCloud(width=300, height=200, background_color='white').generate(text)
    except ValueError:
        # WordCloud raises ValueError when no words survive its own filtering.
        return None

    # Keep a handle on the figure so exactly this figure is closed, even on error.
    fig = plt.figure(figsize=(10, 5))
    try:
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout(pad=0)

        # Save the plot to a bytes buffer
        buf = io.BytesIO()
        fig.savefig(buf, format='png')
    finally:
        plt.close(fig)
    buf.seek(0)

    # Image.open is lazy; load() decodes the pixels now, while the buffer is alive.
    image = Image.open(buf)
    image.load()
    return image
62
+
63
def generate_bigrams(text):
    """Return the ten most frequent word bigrams in *text*.

    The text is lower-cased and tokenised with NLTK; tokens that are not
    purely alphanumeric or that appear in the module-level ``stopwords``
    list are dropped before adjacent pairs are counted.

    Returns:
        A list of ``((word1, word2), count)`` tuples, most frequent first.
    """
    # Build the lookup set once per call: membership tests on the list are O(len(stopwords)).
    stop_set = set(stopwords)
    tokens = [
        token
        for token in nltk.word_tokenize(text.lower())
        if token.isalnum() and token not in stop_set
    ]
    return Counter(bigrams(tokens)).most_common(10)
69
+
70
def predict_decision(sentiment_label):
    """Translate a sentiment label into a purchase-likelihood statement.

    ``'positive'`` maps to high, ``'neutral'`` to moderate, and every other
    value (including ``'negative'``) to low likelihood.
    """
    if sentiment_label == 'positive':
        return "High likelihood of purchase"
    if sentiment_label == 'neutral':
        return "Moderate likelihood of purchase"
    # Anything that is neither positive nor neutral is treated as negative.
    return "Low likelihood of purchase"
77
+
78
# Markdown recommendation text for each TPB factor, keyed by the factor's label.
_TPB_RECOMMENDATIONS = {
    "attitude": """
**Current Issues:**
- High negative perception regarding product quality
- Concerns about halal certification and its authenticity
- Pricing issues in comparison to perceived value

**Recommended Actions:**

1. **Quality Control Improvements**
- Implement enhanced product quality measures
- Obtain globally recognized halal certifications
- Conduct regular quality audits

2. **Educational Campaigns**
- Educate customers on halal certification processes
- Raise awareness about the health benefits of halal products
- Highlight ethical and sustainable sourcing

3. **Pricing Strategy Adjustment**
- Reassess pricing to align with customer expectations
- Introduce discount programs or loyalty initiatives
""",
    "religious knowledge": """
**Current Issues:**
- Lack of awareness and understanding about the halal process
- Customers may be unsure of the religious guidelines followed

**Recommended Actions:**

1. **Religious Knowledge Enhancement**
- Provide clear educational materials on the halal process
- Collaborate with religious scholars to endorse products
- Ensure transparent labeling and certification

2. **Community Engagement**
- Host webinars or community events about halal
- Partner with local religious organizations for outreach
- Share customer testimonials emphasizing trust in your certification
""",
    "subjective norms": """
**Current Issues:**
- Social influence or peer pressure regarding halal compliance is weak
- Lack of community-driven recommendations for the product

**Recommended Actions:**

1. **Influence Social Circles**
- Engage community leaders or influencers to endorse products
- Create social campaigns around the halal certification to enhance peer recommendations

2. **Referral Programs**
- Introduce referral programs where existing customers can promote the product
- Offer incentives for customers who share their experiences with others

3. **Testimonials and Success Stories**
- Use customer testimonials and success stories to strengthen social trust
""",
    "perceived behavioural control": """
**Current Issues:**
- Perceived difficulty in understanding or accessing halal-certified products
- Concerns about control over product quality and sourcing transparency

**Recommended Actions:**

1. **Improve Accessibility**
- Make halal products more accessible through multiple platforms (e-commerce, retail stores)
- Ensure ease of purchase and fast delivery options

2. **Enhance Transparency**
- Provide detailed information about sourcing and production processes
- Use blockchain or similar technology to enhance transparency in halal certification

3. **Customer Empowerment**
- Offer customer feedback channels to empower users to voice concerns and suggestions
- Ensure that concerns are addressed promptly to build trust and satisfaction
""",
}


def generate_report(tpb_sentiment_df, negative_threshold=70):
    """Build a Markdown recommendation report from per-factor sentiment shares.

    Args:
        tpb_sentiment_df: DataFrame with one row per TPB factor. It must have a
            ``tpb_label`` column; the ``negative`` column holds the percentage
            of negative comments for that factor. When ``negative`` is absent
            (no negative comment matched at all) the share is taken as 0.
        negative_threshold: A factor gets a recommendation section only when
            its negative share is strictly above this percentage.

    Returns:
        The report as a Markdown string. It always starts with the report
        heading and contains one section per factor over the threshold, in
        row order. Factors without predefined recommendations are skipped.
    """
    report = "## TPB Factor Analysis and Recommendations Report\n\n"

    for _, row in tpb_sentiment_df.iterrows():
        tpb_label = row['tpb_label']
        # A sentiment that never occurs has no column after unstack(); treat it as 0%.
        negative_percentage = row.get('negative', 0.0)

        if not negative_percentage > negative_threshold:
            continue

        recommendations = _TPB_RECOMMENDATIONS.get(tpb_label)
        if recommendations is None:
            continue

        report += f"### {tpb_label.capitalize()} ({negative_percentage:.1f}% Negative)\n"
        report += recommendations

    return report
177
+
178
+
179
def search_company(keyword):
    """Build every "Company View" output for comments mentioning *keyword*.

    Rows of the module-level ``df`` whose ``text`` contains *keyword*
    (case-insensitive, literal match) are summarised into charts, a report,
    word clouds and a bigram table.

    Returns:
        A 10-tuple matching the Gradio outputs wired to this callback:
        overall sentiment chart, TPB factor chart, sentiment-within-TPB
        chart, up to five short sample rows, the Markdown report, four word
        clouds (attitude, religious knowledge, subjective norms, perceived
        behavioural control) and the bigram DataFrame. Every slot is ``None``
        when the keyword is empty or nothing matches.
    """
    # The callback feeds ten components, so "nothing to show" must also yield ten values.
    no_results = (None,) * 10

    if not keyword:
        return no_results

    # regex=False: the user types plain text, not a pattern (a lone "(" must not crash).
    # na=False: rows with missing text never match and keep the mask purely boolean.
    matches = df['text'].str.contains(keyword, case=False, regex=False, na=False)
    filtered_df = df[matches]

    if filtered_df.empty:
        return no_results

    # Define colors for each sentiment
    color_map = {
        'negative': 'red',
        'neutral': 'gray',
        'positive': 'blue'
    }

    # Calculate sentiment distribution
    sentiment_counts = filtered_df['label'].value_counts(normalize=True) * 100

    # Unknown labels fall back to blue in this chart.
    colors = [color_map.get(sentiment, 'blue') for sentiment in sentiment_counts.index]

    # Create the bar plot
    sentiment_fig = go.Figure(data=[go.Bar(
        x=sentiment_counts.index,
        y=sentiment_counts.values,
        text=[f'{val:.1f}%' for val in sentiment_counts.values],
        textposition='auto',
        marker_color=colors
    )])

    sentiment_fig.update_layout(
        title='Overall Sentiment Distribution',
        xaxis_title='Sentiment',
        yaxis_title='Percentage'
    )

    tpb_counts = filtered_df['tpb_label'].value_counts(normalize=True) * 100
    tpb_fig = go.Figure(data=[go.Bar(
        x=tpb_counts.index,
        y=tpb_counts.values,
        text=[f'{val:.1f}%' for val in tpb_counts.values],
        textposition='auto'
    )])
    tpb_fig.update_layout(title='Overall TPB Factor Distribution', xaxis_title='TPB Factor', yaxis_title='Percentage')

    # Calculate sentiment distribution within each TPB factor (each row sums to 100)
    tpb_sentiment_df = filtered_df.groupby(['tpb_label', 'label']).size().unstack(fill_value=0)
    tpb_sentiment_df = tpb_sentiment_df.div(tpb_sentiment_df.sum(axis=1), axis=0) * 100

    tpb_sentiment_fig = go.Figure()
    for sentiment in tpb_sentiment_df.columns:
        tpb_sentiment_fig.add_trace(go.Bar(
            name=sentiment,
            x=tpb_sentiment_df.index,
            y=tpb_sentiment_df[sentiment],
            text=[f'{val:.1f}%' for val in tpb_sentiment_df[sentiment]],
            textposition='auto',
            marker_color=color_map.get(sentiment, 'gray')
        ))

    tpb_sentiment_fig.update_layout(
        barmode='stack',
        title='Sentiment Distribution within TPB Factors',
        xaxis_title='TPB Factor',
        yaxis_title='Percentage'
    )

    report = generate_report(tpb_sentiment_df.reset_index())

    wordclouds = {}
    bigrams_data = {}
    for label in filtered_df['tpb_label'].unique():
        # Strip quoting markers and the omnipresent topic words before visualising.
        text = ' '.join(filtered_df[filtered_df['tpb_label'] == label]['text']).replace('QUOTE','').replace('quote','').replace('sijil halal','').replace('halal','')
        wordclouds[label] = generate_wordcloud(text)
        bigrams_data[label] = generate_bigrams(text)

    # One column per TPB label holding only the bigram pairs (no frequency).
    # Wrapping each list in a Series lets columns of different length coexist
    # (shorter ones are padded with NaN) instead of raising.
    bigram_df = pd.DataFrame({
        label: pd.Series([word_pair for word_pair, _ in ranked])
        for label, ranked in bigrams_data.items()
    })
    bigram_df.index = [f"Top {i+1}" for i in range(len(bigram_df))]

    return (sentiment_fig, tpb_fig, tpb_sentiment_fig, filtered_df[filtered_df['text'].str.len() < 300].head(5),
            report, wordclouds.get('attitude'), wordclouds.get('religious knowledge'),
            wordclouds.get('subjective norms'), wordclouds.get('perceived behavioural control'), bigram_df)
271
+
272
+
273
+
274
def text_classification_and_sentiment(text, keywords_df=None):
    """Classify *text* by TPB factor and sentiment and derive a purchase decision.

    Args:
        text: The user comment to analyse.
        keywords_df: Optional DataFrame whose first column lists keywords that
            force a negative sentiment. When omitted (as the Gradio callback
            does), the list is read from ``IMG_8137.xlsx``.

    Returns:
        Three display strings: the TPB label, the sentiment with its
        probability, and the purchase decision.
    """
    result_tpb = pipeline_tpb(text)
    tpb_label = result_tpb[0]['label']

    result_sentiment = sentiment_pipeline(text)
    sentiment_label = result_sentiment[0]['label']
    sentiment_score = result_sentiment[0]['score']

    if keywords_df is None:
        keywords_df = pd.read_excel('IMG_8137.xlsx')

    # Any listed keyword found in the text overrides the model with a certain "negative".
    lowered_text = text.lower()
    for keyword in keywords_df.iloc[:, 0].dropna():
        # Cells may hold non-strings; blank cells would otherwise match every text.
        needle = str(keyword).strip().lower()
        if needle and needle in lowered_text:
            sentiment_label = 'negative'
            sentiment_score = 1.0
            break

    decision = predict_decision(sentiment_label)

    tpb_output = f"TPB Label: {tpb_label}"
    sentiment_output = f"Sentiment: {sentiment_label}\nProbability: {sentiment_score * 100:.2f}%"
    decision_output = f"Decision: {decision}"

    return tpb_output, sentiment_output, decision_output
299
+
300
+
301
# Sample comments offered as one-click examples in the "User View" tab.
examples = [
    "Alhamdulillah, hari ni dapat makan dekat restoran halal baru. Rasa puas hati dan tenang bila tau makanan yang kita makan dijamin halal.",
    "Semua orang cakap kena check logo halal sebelum beli makanan. Dah jadi macam second nature dah sekarang. Korang pun sama kan?"
]

# Base stylesheet for the app. The custom property is spelled
# `--body-background-fill` (two dashes); four dashes define an unrelated,
# unused property, so the white background was never applied through it.
css = """
:root {
--bg: #FFFFFF; /* Set the background color to white */
--col: #191919; /* Define primary text color */
--bg-dark: #000000; /* Define dark background color if needed */
--col-dark: #ECF2F7; /* Define dark text color if needed */
--body-background-fill: #FFFFFF;
}

html, body {
background-color: var(--bg); /* Set the background color to white for the entire page */
margin: 0; /* Remove default body margin */
padding: 0; /* Remove default body padding */
}

.container {
max-width: 1000px;
margin: auto;
padding: 20px;
}

.title {
text-align: center;
margin-bottom: 20px;
}

.nav-buttons {
display: flex;
justify-content: center;
gap: 10px;
margin-bottom: 20px;
}

#recommendation_report {
background-color: #f9f9f9; /* Keep this background light for the report section */
padding: 20px;
border: 2px solid #e0e0e0;
border-radius: 10px;
margin-top: 20px;
font-family: Arial, sans-serif;
font-size: 14px;
}

.wrap-text {
white-space: normal !important;
word-wrap: break-word;
}

.footer {visibility: hidden}

"""
357
+
358
# Force a white page background on top of the base stylesheet.
_page_css = css + """
body, .gradio-container, .root, .wrap, #root .background .container {
background-color: white !important;
background-image: none !important;
background-fill: white !important;
}

"""

with gr.Blocks(css=_page_css, theme='aisyahhrazak/miku-aisyah@=1.2.2') as demo:

    with gr.Tabs() as tabs:
        # --- Tab 1: classify a single user comment ---
        with gr.TabItem("User View", id=0):
            gr.Markdown("## Text Classification and Sentiment Analysis Based on User Input About Halal Food Acquisition")
            gr.Markdown("Enter a text to see TPB classification, sentiment analysis, and purchase prediction results!")
            input_text = gr.Textbox(
                lines=2,
                label="Input Comment",
                placeholder="Model can make mistakes, we are striving to improve the model.",
            )
            with gr.Row():
                tpb_output = gr.Textbox(lines=3, label="TPB Classification")
                sentiment_output = gr.Textbox(lines=3, label="Sentiment Analysis")
                decision_output = gr.Textbox(lines=3, label="Purchase Prediction")
            classify_button = gr.Button("Analyze")
            classify_button.click(
                fn=text_classification_and_sentiment,
                inputs=input_text,
                outputs=[tpb_output, sentiment_output, decision_output],
            )
            gr.Examples(examples=examples, inputs=input_text)

        # --- Tab 2: aggregate view over the labelled dataset ---
        with gr.TabItem("Company View", id=1):
            gr.Markdown("# Sentiment Analysis and Purchase Decision Factor for Halal Food Acquisition")

            # Rebinding input_text is safe: the first tab's handlers already hold their textbox.
            input_text = gr.Textbox(lines=1, label="Search Keyword", placeholder="Enter keyword")
            search_button = gr.Button("Search")

            with gr.Row():
                sentiment_chart = gr.Plot(label="Sentiment Distribution")
                tpb_chart = gr.Plot(label="TPB Factor Distribution")

            tpb_sentiment_chart = gr.Plot(label="Sentiment Distribution within TPB Factors")

            # All four word clouds share a single row.
            gr.Markdown("### Word Clouds by TPB Label")

            with gr.Row():
                attitude_wc = gr.Image(label="Attitude Word Cloud", height=200, width=300)
                religious_knowledge_wc = gr.Image(label="Religious Knowledge Word Cloud", height=200, width=300)
                subjective_norms_wc = gr.Image(label="Subjective Norms Word Cloud",height=200, width=300)
                perceived_behavioural_control_wc = gr.Image(label="Perceived Behavioural Control Word Cloud", height=200, width=300)

            with gr.Accordion("See Recommendation Details"):
                report_output = gr.Markdown(label="Recommendation Report", elem_id="recommendation_report")

            gr.Markdown("### Top Bigrams by TPB Label")
            bigram_table = gr.Dataframe(label="Top Bigrams for Each TPB Label")

            output_table = gr.Dataframe(
                headers=["text", "tpb_label", "sentiment", "score"],
                label="Company Analysis Results",
                wrap=True,
            )

            # Order must match the tuple returned by search_company.
            search_button.click(
                fn=search_company,
                inputs=input_text,
                outputs=[
                    sentiment_chart,
                    tpb_chart,
                    tpb_sentiment_chart,
                    output_table,
                    report_output,
                    attitude_wc,
                    religious_knowledge_wc,
                    subjective_norms_wc,
                    perceived_behavioural_control_wc,
                    bigram_table,
                ],
            )

demo.launch()
attn_mask_utils.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+ import torch
3
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
4
+
5
+
6
def _prepare_4d_causal_attention_mask(
    attention_mask: Optional[torch.Tensor],
    input_shape: Union[torch.Size, Tuple, List],
    inputs_embeds: torch.Tensor,
    past_key_values_length: int,
    sliding_window: Optional[int] = None,
):
    """
    Creates a 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`. Unlike the implementation this was adapted from, the mask converter is
    built with `is_causal=False`.

    Args:
        attention_mask (`torch.Tensor` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`, or an already expanded 4D mask of
            shape `(batch_size, 1, query_length, key_value_length)`.
        input_shape (`tuple(int)` or `list(int)` or `torch.Size`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`torch.Tensor`):
            The embedded inputs as a torch Tensor.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.

    Raises:
        ValueError: If a 4D `attention_mask` does not have the expected shape.
    """
    attn_mask_converter = AttentionMaskConverter(
        is_causal=False, sliding_window=sliding_window
    )  # is_causal=True in original implementation

    key_value_length = input_shape[-1] + past_key_values_length

    # 4d mask is passed through the layers
    if attention_mask is not None and len(attention_mask.shape) == 2:
        attention_mask = attn_mask_converter.to_4d(
            attention_mask,
            input_shape[-1],
            key_value_length=key_value_length,
            dtype=inputs_embeds.dtype,
        )
    elif attention_mask is not None and len(attention_mask.shape) == 4:
        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
        if tuple(attention_mask.shape) != expected_shape:
            raise ValueError(
                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
            )
        else:
            # if the 4D mask has correct shape - invert it and fill with negative infinity.
            # Cast first (as the SDPA variant does) so bool/int masks can be subtracted
            # and the result carries the embeddings' dtype.
            inverted_mask = 1.0 - attention_mask.to(inputs_embeds.dtype)
            attention_mask = inverted_mask.masked_fill(
                inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min
            )
    else:
        # NOTE(review): the converter above is built with is_causal=False; confirm that
        # `to_causal_4d` accepts such a converter in the installed transformers version.
        attention_mask = attn_mask_converter.to_causal_4d(
            input_shape[0],
            input_shape[-1],
            key_value_length,
            dtype=inputs_embeds.dtype,
            device=inputs_embeds.device,
        )

    return attention_mask
65
+
66
+
67
# Adapted from _prepare_4d_causal_attention_mask
def _prepare_4d_causal_attention_mask_for_sdpa(
    attention_mask: Optional[torch.Tensor],
    input_shape: Union[torch.Size, Tuple, List],
    inputs_embeds: torch.Tensor,
    past_key_values_length: int,
    sliding_window: Optional[int] = None,
):
    """
    Builds the `attn_mask` argument expected by `torch.nn.functional.scaled_dot_product_attention`.

    When nothing is masked in `attention_mask` and either `query_length == 1` or
    `key_value_length == query_length`, `None` is returned so that SDPA can dispatch to the flash attention
    kernel (which cannot be used when a custom `attn_mask` is passed). An already expanded 4D mask is
    shape-checked, inverted and returned directly.
    """
    attn_mask_converter = AttentionMaskConverter(
        is_causal=False, sliding_window=sliding_window
    )  # is_causal=True in original implementation

    key_value_length = input_shape[-1] + past_key_values_length
    batch_size, query_length = input_shape
    min_value = torch.finfo(inputs_embeds.dtype).min

    # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1`
    # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing.
    # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).
    is_tracing = (
        torch.jit.is_tracing()
        or isinstance(inputs_embeds, torch.fx.Proxy)
        or (hasattr(torch, "_dynamo") and torch._dynamo.is_compiling())
    )

    if attention_mask is not None:
        if len(attention_mask.shape) == 4:
            # 4d mask is passed through after validation
            expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
            if tuple(attention_mask.shape) != expected_shape:
                raise ValueError(
                    f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
                )
            # correct shape - invert it and fill with negative infinity
            inverted_mask = 1.0 - attention_mask.to(inputs_embeds.dtype)
            return inverted_mask.masked_fill(inverted_mask.to(torch.bool), min_value)

        if not is_tracing and torch.all(attention_mask == 1):
            # For query_length == 1, causal attention and bi-directional attention are the same.
            # For query_length > 1 and key_value_length != query_length, we cannot generally ignore the attention mask, as SDPA causal mask generation
            # may be wrong, so the mask is kept in that case.
            # Reference: https://github.com/pytorch/pytorch/issues/108108
            if query_length == 1 or key_value_length == query_length:
                attention_mask = None
    elif query_length > 1 and key_value_length != query_length:
        # See the comment above (https://github.com/pytorch/pytorch/issues/108108).
        # Ugly: we set it to True here to dispatch in the following controlflow to `to_causal_4d`.
        attention_mask = True
    elif is_tracing:
        raise ValueError(
            'Attention using SDPA can not be traced with torch.jit.trace when no attention_mask is provided. To solve this issue, please either load your model with the argument `attn_implementation="eager"` or pass an attention_mask input when tracing the model.'
        )

    if attention_mask is None:
        return None

    if attention_mask is True:
        return attn_mask_converter.to_causal_4d(
            input_shape[0],
            input_shape[-1],
            key_value_length,
            dtype=inputs_embeds.dtype,
            device=inputs_embeds.device,
        )

    expanded_4d_mask = attn_mask_converter.to_4d(
        attention_mask,
        input_shape[-1],
        dtype=inputs_embeds.dtype,
        key_value_length=key_value_length,
    )

    # Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when
    # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
    # Details: https://github.com/pytorch/pytorch/issues/110213
    if not is_tracing and expanded_4d_mask.device.type == "cuda":
        expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
            expanded_4d_mask, min_dtype=min_value
        )

    return expanded_4d_mask
bidirectional_mistral.py ADDED
@@ -0,0 +1,281 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Optional, Tuple, Union
2
+ import torch
3
+
4
+ from transformers import (
5
+ MistralModel,
6
+ MistralPreTrainedModel,
7
+ MistralForCausalLM,
8
+ MistralConfig,
9
+ )
10
+ from transformers.modeling_outputs import BaseModelOutputWithPast
11
+ from transformers.cache_utils import Cache, DynamicCache
12
+ from transformers.models.mistral.modeling_mistral import (
13
+ MistralDecoderLayer,
14
+ MistralRMSNorm,
15
+ MistralAttention,
16
+ MistralFlashAttention2,
17
+ MistralSdpaAttention,
18
+ MistralMLP,
19
+ )
20
+ from torch import nn
21
+ from transformers.utils import logging
22
+ from attn_mask_utils import (
23
+ _prepare_4d_causal_attention_mask,
24
+ _prepare_4d_causal_attention_mask_for_sdpa,
25
+ )
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+
30
class _NonCausalAttentionMixin:
    """Runs the wrapped attention's constructor, then clears its ``is_causal`` flag."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.is_causal = False


class ModifiedMistralAttention(_NonCausalAttentionMixin, MistralAttention):
    """``MistralAttention`` with ``is_causal`` set to ``False``."""


class ModifiedMistralFlashAttention2(_NonCausalAttentionMixin, MistralFlashAttention2):
    """``MistralFlashAttention2`` with ``is_causal`` set to ``False``."""


class ModifiedMistralSdpaAttention(_NonCausalAttentionMixin, MistralSdpaAttention):
    """``MistralSdpaAttention`` with ``is_causal`` set to ``False``."""


# Attention class to instantiate for each `config._attn_implementation` value.
MISTRAL_ATTENTION_CLASSES = {
    "eager": ModifiedMistralAttention,
    "flash_attention_2": ModifiedMistralFlashAttention2,
    "sdpa": ModifiedMistralSdpaAttention,
}
53
+
54
+
55
class ModifiedMistralDecoderLayer(MistralDecoderLayer):
    """Decoder layer whose self-attention comes from ``MISTRAL_ATTENTION_CLASSES``."""

    def __init__(self, config: MistralConfig, layer_idx: int):
        # Initialise as a bare nn.Module: the parent constructor is not called,
        # every sub-module is created here instead.
        nn.Module.__init__(self)
        self.hidden_size = config.hidden_size

        attention_cls = MISTRAL_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config, layer_idx)

        self.mlp = MistralMLP(config)

        norm_eps = config.rms_norm_eps
        self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=norm_eps)
        self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=norm_eps)
71
+
72
+
73
class MistralBiModel(MistralModel):
    """Mistral backbone assembled from ``ModifiedMistralDecoderLayer`` blocks.

    Attention masks are prepared with the helpers from ``attn_mask_utils``.
    """

    def __init__(self, config: MistralConfig):
        # Skip MistralModel.__init__ so the layer stack can be built from the modified layers.
        MistralPreTrainedModel.__init__(self, config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        decoder_layers = [
            ModifiedMistralDecoderLayer(config, layer_idx)
            for layer_idx in range(config.num_hidden_layers)
        ]
        self.layers = nn.ModuleList(decoder_layers)
        self._attn_implementation = config._attn_implementation
        self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    # Copied from forward() in transformers.models.mistral.modeling_mistral.MistralModel
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Run the layer stack over ``input_ids`` or ``inputs_embeds`` (exactly one is required).

        Flags left as ``None`` fall back to the corresponding ``self.config`` values.
        Returns a ``BaseModelOutputWithPast``, or a tuple of its non-``None`` fields
        when ``return_dict`` is false.

        Raises:
            ValueError: If both or neither of ``input_ids`` / ``inputs_embeds`` are given,
                or if right-padded input is combined with flash attention and caching.
        """
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if use_cache is None:
            use_cache = cfg.use_cache
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError(
                "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
            )
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError(
                "You have to specify either decoder_input_ids or decoder_inputs_embeds"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        past_key_values_length = 0

        if use_cache:
            # Legacy tuple caches are wrapped in a DynamicCache and unwrapped again on return.
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            target_device = (
                input_ids.device if input_ids is not None else inputs_embeds.device
            )
            position_ids = torch.arange(
                past_key_values_length,
                seq_length + past_key_values_length,
                dtype=torch.long,
                device=target_device,
            )
            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
        else:
            position_ids = position_ids.view(-1, seq_length).long()

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if (
            attention_mask is not None
            and self._attn_implementation == "flash_attention_2"
            and use_cache
        ):
            # Right padding shows up as at least one masked position in the last column.
            right_padded = attention_mask[:, -1].sum().item() != batch_size
            if right_padded:
                raise ValueError(
                    "You are attempting to perform batched generation with padding_side='right'"
                    " this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to "
                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                )

        if self._attn_implementation == "flash_attention_2":
            # 2d mask is passed through the layers; dropped entirely when nothing is masked
            attention_mask = (
                attention_mask
                if (attention_mask is not None and 0 in attention_mask)
                else None
            )
        elif self._attn_implementation == "sdpa" and not output_attentions:
            # The original implementation is by-passed, see attn_mask_utils.py
            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
            )
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
                sliding_window=self.config.sliding_window,
            )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache follows the attention weights when those are returned too.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = (
                next_decoder_cache.to_legacy_cache()
                if use_legacy_cache
                else next_decoder_cache
            )

        if not return_dict:
            candidates = [hidden_states, next_cache, all_hidden_states, all_self_attns]
            return tuple(v for v in candidates if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
271
+
272
+
273
class MistralBiForMNTP(MistralForCausalLM):
    """``MistralForCausalLM`` variant whose backbone is a ``MistralBiModel``."""

    def __init__(self, config):
        # Skip MistralForCausalLM.__init__ so the backbone can be swapped in.
        MistralPreTrainedModel.__init__(self, config)
        self.model = MistralBiModel(config)
        self.vocab_size = config.vocab_size
        # Bias-free projection from hidden states to vocabulary logits.
        self.lm_head = nn.Linear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
        )

        # Initialize weights and apply final processing
        self.post_init()
classifier.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from bidirectional_mistral import MistralBiModel
2
+ from transformers import MistralPreTrainedModel
3
+ import torch
4
+ import numpy as np
5
+ from typing import Optional, List
6
+ from torch import nn
7
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
8
+ from transformers.modeling_outputs import SequenceClassifierOutputWithPast
9
+
10
+
11
class MistralForSequenceClassification(MistralPreTrainedModel):
    """Sequence classifier built on the bidirectional ``MistralBiModel`` backbone.

    A bias-free linear head (``score``) maps the hidden state found at sequence
    position 0 of every example to ``config.num_labels`` logits.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = MistralBiModel(config)
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
    ):
        r"""
        Run the backbone, pool position 0 and score it; optionally compute a loss.

        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        token_type_ids (*optional*):
            Accepted but ignored: it is never forwarded to the backbone.
            NOTE(review): presumably present so tokenizer outputs that contain
            this key can be passed with `**inputs` - confirm.

        Returns:
            A `SequenceClassifierOutputWithPast` when `return_dict` resolves to
            true. Otherwise a tuple `(loss, logits, *rest)` - `loss` only when
            `labels` is given - where `rest` is every backbone output after the
            last hidden state.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Pool by taking the hidden state at sequence position 0 of each example.
        pooled_output = transformer_outputs[0][:, 0]
        logits = self.score(pooled_output)

        loss = None
        if labels is not None:
            # Labels may live on another device than the head's output (e.g. a
            # sharded model); the loss functions need both on the same device.
            labels = labels.to(logits.device)

            # Infer the problem type once and remember it on the config.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Only index 0 (the last hidden state) is consumed above, so pass on
            # everything after it. Slicing from 2 silently dropped whichever
            # optional output came next (the cache, or the hidden states when
            # caching is off) and disagreed with the dict return below.
            output = (logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
en.json ADDED
@@ -0,0 +1 @@
 
 
1
+ ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't"
,"it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two",
"u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]
sentiment-tpb-dataset.jsonl ADDED
The diff for this file is too large to render. See raw diff