pavlyhalim committed on
Commit
1d03764
·
1 Parent(s): 1a27bf2

Add application and model files

Browse files
Files changed (3) hide show
  1. app.py +245 -0
  2. model.joblib +3 -0
  3. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import joblib
5
+ import plotly.graph_objects as go
6
+ from sklearn.base import BaseEstimator, ClassifierMixin
7
+ from sklearn.preprocessing import RobustScaler, LabelEncoder
8
+ from sklearn.feature_selection import SelectFromModel
9
+ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
10
+ import xgboost as xgb
11
+ from sklearn.linear_model import LogisticRegression
12
+ import time
13
+ from datetime import datetime
14
+
15
class OptimizedStackedClassifier(BaseEstimator, ClassifierMixin):
    """Stacked classifier: scale -> select features -> base models -> meta model.

    This class only implements inference. No ``fit`` method is defined here;
    the fitted components (``scaler``, ``label_encoder``, ``base_models``,
    ``meta_model``, ``selected_features``) are expected to be populated on the
    instance before ``predict`` / ``predict_proba`` are called, e.g. by
    loading a previously saved instance.

    Attribute names are kept exactly as they were so that already-pickled
    instances of this class continue to load and work.
    """

    def __init__(self):
        self.scaler = RobustScaler()
        self.label_encoder = LabelEncoder()
        self.feature_selector = None
        # Iterable of (name, model) pairs; each model must expose predict_proba.
        self.base_models = None
        self.meta_model = None
        # Column labels kept after scaling; used to index the scaled frame.
        self.selected_features = None
        self.start_time = time.time()

    def _build_meta_features(self, X):
        """Return the meta-model input matrix for the rows of ``X``.

        ``X`` must be a DataFrame (its ``columns`` are reused to label the
        scaled values). The result is the column-wise concatenation of every
        base model's ``predict_proba`` output, in ``base_models`` order.
        """
        # Scale, then keep only the columns chosen during feature selection.
        X_scaled = pd.DataFrame(
            self.scaler.transform(X),
            columns=X.columns,
        )
        X_selected = X_scaled[self.selected_features]

        # One probability block per base model, laid side by side. Stacking
        # the blocks avoids hard-coding the number of classes per model.
        blocks = [
            model.predict_proba(X_selected)
            for _name, model in self.base_models
        ]
        # Cast to float64 so the meta model sees the same dtype as the
        # previously pre-allocated np.zeros buffer did, even when a base
        # model emits float32 probabilities.
        return np.hstack(blocks).astype(np.float64, copy=False)

    def predict(self, X):
        """Make predictions using optimized pipeline.

        Returns the meta model's predictions mapped back to the original
        labels through ``label_encoder.inverse_transform``.
        """
        predictions = self.meta_model.predict(self._build_meta_features(X))
        return self.label_encoder.inverse_transform(predictions)

    def predict_proba(self, X):
        """Get prediction probabilities from the meta model for each row of ``X``."""
        return self.meta_model.predict_proba(self._build_meta_features(X))
60
+
61
@st.cache_resource(show_spinner=False)
def _load_model_cached(model_path):
    """Deserialize the model at ``model_path`` once and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the model file would be read from disk each time.
    Exceptions propagate to the caller and are not cached, so a failed load
    is retried on the next rerun.
    """
    # NOTE(security): joblib.load unpickles the file and can execute arbitrary
    # code. Only point this at model files from a trusted source.
    return joblib.load(model_path)


def load_model(model_path):
    """Load the saved model.

    Args:
        model_path: Path of the joblib file to load.

    Returns:
        The deserialized model, or ``None`` if loading failed (in which case
        the error is also shown in the Streamlit UI).
    """
    try:
        return _load_model_cached(model_path)
    except Exception as e:
        st.error(f"Error loading model: {str(e)}")
        return None
68
+
69
def create_features(input_data):
    """Build the single-row feature frame expected by the model.

    The nine raw inputs are copied from ``input_data`` in a fixed order, then
    seven derived columns are appended. Column order is significant and is
    kept stable.

    Args:
        input_data: Mapping holding the nine raw feature values by name.

    Returns:
        A one-row ``pandas.DataFrame`` with 16 columns.
    """
    raw_names = (
        'chars_original',
        'chars_tokenized',
        'num_words',
        'num_tokens',
        'unique_tokens',
        'type_token_ratio',
        'fertility',
        'token_std',
        'avg_token_len',
    )
    row = {name: input_data[name] for name in raw_names}

    # Small offset so that zero counts do not cause a division by zero.
    eps = 1e-10
    safe_words = row['num_words'] + eps
    safe_tokens = row['num_tokens'] + eps

    # Derived features, appended in the order the model expects.
    row['chars_per_word'] = row['chars_original'] / safe_words
    row['chars_per_token'] = row['chars_tokenized'] / safe_tokens
    row['tokens_per_word'] = row['num_tokens'] / safe_words
    row['token_complexity'] = row['token_std'] * row['avg_token_len']
    row['lexical_density'] = row['unique_tokens'] / safe_words
    row['log_chars'] = np.log1p(row['chars_original'])

    score = row['token_complexity'] * row['lexical_density']
    row['complexity_score'] = score * row['type_token_ratio']

    return pd.DataFrame([row])
98
+
99
def plot_probabilities(probabilities):
    """Return a Plotly bar chart with one bar per predicted probability.

    Bars are labelled ``Level 1`` .. ``Level N`` by position, and each bar is
    annotated with its value rounded to three decimals.
    """
    level_labels = [
        f'Level {rank}' for rank in range(1, len(probabilities) + 1)
    ]
    bars = go.Bar(
        x=level_labels,
        y=probabilities,
        text=np.round(probabilities, 3),
        textposition='auto',
    )

    chart = go.Figure(data=[bars])
    chart.update_layout(
        title='Probability Distribution Across Readability Levels',
        xaxis_title='Readability Level',
        yaxis_title='Probability',
        yaxis_range=[0, 1],
        height=400,
    )
    return chart
117
+
118
def plot_feature_values(features_df):
    """Return a Plotly bar chart of the first row of ``features_df``.

    Column names become the x-axis categories; each bar is annotated with
    its value rounded to two decimals.
    """
    first_row = features_df.values[0]
    bars = go.Bar(
        x=features_df.columns,
        y=first_row,
        text=np.round(first_row, 2),
        textposition='auto',
    )

    chart = go.Figure(data=[bars])
    chart.update_layout(
        title='Feature Values',
        xaxis_title='Features',
        yaxis_title='Value',
        xaxis_tickangle=-45,
        height=500,
    )
    return chart
136
+
137
def main():
    """Render the Streamlit page: input form, prediction results, and help text.

    The model is loaded from ``model.joblib`` located next to this script.
    If it cannot be loaded, an error is shown and nothing else is rendered.
    """
    # Function-scope import so this block does not depend on file-level changes.
    from pathlib import Path

    st.set_page_config(page_title="Text Readability Classifier", layout="wide")

    st.title("Text Readability Classifier")
    st.write("This app predicts the readability level based on text characteristics.")

    # Load the model. Resolve it relative to this file rather than using an
    # absolute path on the developer's machine, so the app also works when
    # deployed or launched from a different working directory.
    model_path = str(Path(__file__).resolve().parent / "model.joblib")
    model = load_model(model_path)

    if model is None:
        st.error("Could not load the model. Please check if the model file exists.")
        return

    # Create two columns for layout
    col1, col2 = st.columns([2, 1])

    with col1:
        # Input form for text characteristics
        st.subheader("Enter Text Characteristics")

        # Basic features input. Counts and statistics cannot be negative;
        # min_value stops such input from reaching log1p and the ratios.
        input_data = {}
        input_data['chars_original'] = st.number_input(
            'Number of Characters (Original)', value=0, min_value=0)
        input_data['chars_tokenized'] = st.number_input(
            'Number of Characters (Tokenized)', value=0, min_value=0)
        input_data['num_words'] = st.number_input(
            'Number of Words', value=0, min_value=0)
        input_data['num_tokens'] = st.number_input(
            'Number of Tokens', value=0, min_value=0)
        input_data['unique_tokens'] = st.number_input(
            'Number of Unique Tokens', value=0, min_value=0)
        input_data['type_token_ratio'] = st.number_input(
            'Type-Token Ratio', value=0.0, min_value=0.0, max_value=1.0)
        input_data['fertility'] = st.number_input(
            'Fertility', value=0.0, min_value=0.0)
        input_data['token_std'] = st.number_input(
            'Token Standard Deviation', value=0.0, min_value=0.0)
        input_data['avg_token_len'] = st.number_input(
            'Average Token Length', value=0.0, min_value=0.0)

        analyze_button = st.button("Analyze", type="primary")

        if analyze_button:
            with st.spinner("Analyzing..."):
                # Top-level UI boundary: surface any failure to the user
                # instead of crashing the page.
                try:
                    # Create features dataframe with all required features
                    features_df = create_features(input_data)

                    # Make prediction
                    prediction = model.predict(features_df)[0]
                    probabilities = model.predict_proba(features_df)[0]

                    # Display results
                    st.subheader("Analysis Results")

                    # Create metrics row
                    metrics_cols = st.columns(2)
                    with metrics_cols[0]:
                        st.metric("Readability Level", f"Level {prediction}")
                    with metrics_cols[1]:
                        highest_prob = max(probabilities)
                        st.metric("Confidence", f"{highest_prob:.2%}")

                    # Show probability distribution
                    st.plotly_chart(plot_probabilities(probabilities),
                                    use_container_width=True)

                    # Show all feature values including derived features
                    st.subheader("All Features (Including Derived)")
                    st.plotly_chart(plot_feature_values(features_df),
                                    use_container_width=True)

                except Exception as e:
                    st.error(f"Error during analysis: {str(e)}")

    with col2:
        # Information sidebar
        with st.container():
            st.subheader("About Readability Levels")
            st.write("""
            The model predicts readability on a scale from 1 to 6:
            - **Level 1**: Very Easy
            - **Level 2**: Easy
            - **Level 3**: Moderately Easy
            - **Level 4**: Moderate
            - **Level 5**: Moderately Difficult
            - **Level 6**: Difficult
            """)

            st.subheader("Feature Explanations")
            st.write("""
            **Basic Features:**
            - Character counts (original and tokenized)
            - Word and token counts
            - Type-token ratio (vocabulary diversity)
            - Token length statistics

            **Derived Features:**
            - Characters per word/token
            - Token complexity
            - Lexical density
            - Overall complexity score
            """)

            st.subheader("Model Performance")
            st.write("""
            This model achieves:
            - **Accuracy**: 73.86%
            - **Macro Avg F1**: 0.75
            - **Weighted Avg F1**: 0.74

            *Note: Results should be used as guidance rather than absolute measures.*
            """)
243
+
244
# Script entry point: build the Streamlit page only when this file is run
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5160f5caf5965a9cc267b61c8ae47b629a7f178172ad80daf1c16201a03b1a93
3
+ size 1069137279
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ pandas
2
+ numpy
3
+ joblib
4
+ plotly
5
+ scikit-learn
6
+ xgboost