eaglelandsonce committed
Update pages/21_NLP.py
+67 -70
pages/21_NLP.py
CHANGED
@@ -1,85 +1,82 @@
 import streamlit as st
-import tensorflow as tf
-from tensorflow.keras.preprocessing.sequence import pad_sequences
 import numpy as np
+import pandas as pd
 import matplotlib.pyplot as plt
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.models import Sequential
+from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
+from tensorflow.keras.callbacks import EarlyStopping
 from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score, confusion_matrix
+from tensorflow.keras.datasets import imdb
 
-# Load the
-
-
-# Load dataset
-dataset = load_dataset("imdb")
+# Load the dataset
+(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=5000)
 
-#
-
-
-# Tokenizer parameters
-vocab_size = 10000
-max_length = 128
+# Data Preprocessing
+max_words = 500
+max_len = 500
 embedding_dim = 128
 
-
-
-tokenizer.fit_on_texts(train_data['text'].values)
-word_index = tokenizer.word_index
-
-# Convert text to sequences
-X_train = tokenizer.texts_to_sequences(train_data['text'].values)
-X_test = tokenizer.texts_to_sequences(test_data['text'].values)
+X_train = pad_sequences(X_train, maxlen=max_len)
+X_test = pad_sequences(X_test, maxlen=max_len)
 
-#
-
-
+# Build the Model
+model = Sequential()
+model.add(Embedding(input_dim=5000, output_dim=embedding_dim, input_length=max_len))
+model.add(SpatialDropout1D(0.2))
+model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
+model.add(Dense(1, activation='sigmoid'))
 
-
-y_train = train_data['label'].values
-y_test = test_data['label'].values
+model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
 
-#
-
-    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
-    tf.keras.layers.LSTM(64, return_sequences=True),
-    tf.keras.layers.LSTM(32),
-    tf.keras.layers.Dense(24, activation='relu'),
-    tf.keras.layers.Dense(1, activation='sigmoid')
-])
+# Train the Model
+X_train_partial, X_val, y_train_partial, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
 
-
+early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
 
-
-model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
+history = model.fit(X_train_partial, y_train_partial, epochs=10, batch_size=64, validation_data=(X_val, y_val), callbacks=[early_stopping])
 
-#
-history = model.fit(X_train, y_train, epochs=3, validation_split=0.1, batch_size=32)
-
-# Evaluate the model
+# Evaluate the Model
 loss, accuracy = model.evaluate(X_test, y_test)
-st.write(f'Test Accuracy: {accuracy}')
-
-#
-
-
-
-
-
-
-
-st.pyplot(
-
-
-
-
-
-
-
-
-st.pyplot(
-
-#
-
-
-
-
-
-st.
+st.write(f'Test Accuracy: {accuracy:.4f}')
+
+# Plotting functions
+def plot_accuracy(history):
+    plt.plot(history.history['accuracy'])
+    plt.plot(history.history['val_accuracy'])
+    plt.title('Model accuracy')
+    plt.ylabel('Accuracy')
+    plt.xlabel('Epoch')
+    plt.legend(['Train', 'Validation'], loc='upper left')
+    st.pyplot(plt)
+
+def plot_loss(history):
+    plt.plot(history.history['loss'])
+    plt.plot(history.history['val_loss'])
+    plt.title('Model loss')
+    plt.ylabel('Loss')
+    plt.xlabel('Epoch')
+    plt.legend(['Train', 'Validation'], loc='upper left')
+    st.pyplot(plt)
+
+# Display plots
+plot_accuracy(history)
+plot_loss(history)
+
+# Text Input and Prediction
+st.header("Movie Review Sentiment Analysis")
+review_input = st.text_area("Enter your movie review:", "This movie was fantastic! I loved it.")
+
+# Tokenization and padding
+tokenizer = Tokenizer(num_words=5000)
+tokenizer.fit_on_texts(review_input)
+review_seq = tokenizer.texts_to_sequences([review_input])
+review_pad = pad_sequences(review_seq, maxlen=max_len)
+
+# Prediction
+if st.button("Classify Review"):
+    prediction = (model.predict(review_pad) > 0.5).astype("int32")
+    sentiment = "Positive" if prediction[0][0] == 1 else "Negative"
+    st.write(f'Sentiment: **{sentiment}**')
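One caveat worth flagging in the new version: `imdb.load_data` returns reviews that are already encoded as integer word indices, but the prediction path fits a fresh `Tokenizer` on the raw review, so the sequence handed to `model.predict` does not share a vocabulary with the training data. (Since `fit_on_texts` expects a list of texts, passing a single string also makes it fit on individual characters, and the new `max_words = 500` is never used; the Embedding layer hard-codes `input_dim=5000`.) Below is a minimal sketch of an encoder that reuses the Keras IMDB word index instead. It assumes the `imdb.load_data` defaults (`start_char=1`, `oov_char=2`, `index_from=3`) and the `num_words=5000` cap used above; `encode_review` is a hypothetical helper name, not part of the committed file.

# Sketch only: encode a raw review with the same word index used by imdb.load_data.
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

word_index = imdb.get_word_index()  # maps word -> rank (1-based)

def encode_review(text, num_words=5000, maxlen=500, index_from=3):
    seq = [1]  # start_char marking the beginning of the review
    for word in text.lower().split():
        rank = word_index.get(word)
        # Keep a word only if its shifted index stays under the vocabulary cap;
        # load_data replaces everything else with the OOV index.
        if rank is not None and rank + index_from < num_words:
            seq.append(rank + index_from)
        else:
            seq.append(2)  # oov_char
    return pad_sequences([seq], maxlen=maxlen)

# Usage: review_pad = encode_review(review_input); model.predict(review_pad)

With an encoder like this in place, the `Tokenizer` block in the commit could be dropped entirely.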