suicideDetector / main.py
AndyJamesTurner's picture
Python version 3.10
8b59fc6 verified
import os
import sklearn
import xgboost
import dill as pickle
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, accuracy_score, roc_auc_score, RocCurveDisplay, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
from skops import hub_utils
from skops import card
# Load the Suicide Detection dataset, keeping only the text and label columns.
# The CSV is available at
# https://www.kaggle.com/datasets/nikhileswarkomati/suicide-watch
df = pd.read_csv(
    "Suicide_Detection.csv",
    usecols=["text", "class"],
    dtype={"text": str, "class": str},
)

# Features are the raw texts; the target is 1 for the 'suicide' class and 0
# for any other label.
X = df["text"].to_list()
y = (df["class"] == "suicide").astype(int).to_list()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
def preprocessor(s):
    """Strip English stop words from a document before tf-idf tokenisation.

    The text is split on whitespace, every word whose lower-cased form is in
    scikit-learn's ``ENGLISH_STOP_WORDS`` is dropped, and the remaining words
    are re-joined with single spaces.  The case of the surviving words is
    left untouched.

    Args:
        s: The document to clean.  Non-string values are converted with
            ``str()``; ``None`` is treated as an empty document.

    Returns:
        The document with stop words removed, or ``""`` when ``s`` is
        ``None``.
    """
    if s is None:
        return ""

    # Imported inside the function so that it does not rely on any
    # module-level name: this function is embedded in the pipeline that is
    # serialised with dill.
    # NOTE(review): presumably this is what keeps the unpickled model usable
    # without this script -- confirm before hoisting the import.
    from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

    # ENGLISH_STOP_WORDS is already a frozenset, so it is used for membership
    # tests directly instead of being copied into a new set on every call.
    # str.split() with no separator never yields empty strings, so no extra
    # emptiness check is needed.
    return " ".join(
        word
        for word in str(s).split()
        if word.lower() not in ENGLISH_STOP_WORDS
    )
# Assemble the model: a tf-idf vectoriser over 1- to 3-grams (using the custom
# stop-word preprocessor and min_df=100) feeding an XGBoost classifier.
tfidf_step = TfidfVectorizer(
    preprocessor=preprocessor,
    ngram_range=(1, 3),
    min_df=100,
)
classifier_step = xgboost.XGBClassifier()
model = Pipeline(
    steps=[
        ("tfidf", tfidf_step),
        ("classifier", classifier_step),
    ],
    verbose=True,
)

# Train on the training split.
model.fit(X_train, y_train)

# Score the held-out split: hard labels, plus the second column of the
# predicted probabilities for use in the ROC analysis.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Serialise the fitted pipeline to disk (``pickle`` is dill in this file).
model_filename = "model.pkl"
with open(model_filename, "wb") as model_file:
    pickle.dump(model, model_file)
# Local directory that will hold the Hugging Face repository contents.
local_repo = Path("suicide-detector")

# Initialise the repository from the pickled model, recording the library
# versions used for training and passing the test texts as example data.
pinned_requirements = [
    f"scikit-learn={sklearn.__version__}",
    f"xgboost={xgboost.__version__}",
]
hub_utils.init(
    model=model_filename,
    requirements=pinned_requirements,
    dst=str(local_repo),
    task="text-classification",
    data=X_test,
)
# Create the model card, taking its metadata from the configuration stored in
# the local repository folder.
model_card = card.Card(model, metadata=card.metadata_from_config(local_repo))

# Add the license to the card metadata.
model_card.metadata.license = "mit"

model_description = """
Suicide Detection text classification model.
PYTHON 3.10 ONLY
"""
model_card.add(**{"Model description": model_description})

# Drop the template sections that this card does not fill in.
# The "Evaluation Results" section is not deleted.
model_card.delete("Model description/Intended uses & limitations")
model_card.delete("Model Card Contact")
model_card.delete("Citation")
model_card.delete("Model Card Authors")

# Describe how the model was trained (fixes the doubled "the the" that was
# previously published in this section).
training_procedure = """
Trained using 0.7 of the Suicide and Depression Detection dataset (https://www.kaggle.com/datasets/nikhileswarkomati/suicide-watch)
The model vectorises each text using a trained tfidf vectorizer and then classifies using xgboost.
See main.py for further details.
"""
model_card.add(**{"Model description/Training Procedure": training_procedure})

# Describe how the model was evaluated.
eval_descr = (
    "The model was evaluated on a 0.3 holdout split using f1 score, accuracy, confusion matrix and ROC curves."
)
model_card.add(**{"Model Evaluation": eval_descr})
# Evaluate the model on the held-out split and record the results on the
# Hugging Face model card.

# Scalar metrics.
# NOTE(review): with average="micro" on a single-label problem the F1 score
# coincides with accuracy -- confirm that this is the intended average.
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="micro")
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Confusion matrix plot, saved into the repository folder.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
cm_display = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model.classes_,
)
cm_display.plot()
cm_display.figure_.savefig(local_repo / "confusion_matrix.png")

# ROC curve plot, saved into the repository folder.
roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
roc_display.plot()
roc_display.figure_.savefig(local_repo / "roc_curve.png")

# Classification report as a table with one row per report entry.
report_dict = classification_report(
    y_test,
    y_pred,
    output_dict=True,
    target_names=["not suicide", "suicide"],
)
clf_report = pd.DataFrame(report_dict).transpose().reset_index()

# Attach everything to the card: metrics first, then the plots (referenced by
# file name), then the report table.
card_metrics = {"accuracy": accuracy, "f1 score": f1, "ROC AUC": roc_auc}
model_card.add_metrics(**card_metrics)
model_card.add_plot(**{"Model Evaluation/Confusion matrix": "confusion_matrix.png"})
model_card.add_plot(**{"Model Evaluation/ROC Curve": "roc_curve.png"})
model_card.add_table(**{"Classification Report": clf_report})
# Usage snippet shown on the model page.
get_started_code = """
```python
import sklearn
import dill as pickle
from skops import hub_utils
from pathlib import Path
suicide_detector_repo = Path("./suicide-detector")
hub_utils.download(
repo_id="AndyJamesTurner/suicideDetector",
dst=suicide_detector_repo
)
with open(suicide_detector_repo / "model.pkl", 'rb') as file:
clf = pickle.load(file)
classification = clf.predict(["I want to kill myself"])[0]
```
"""

# Author list shown on the model page.
authors = """
This model was created by the following authors:
* Andy Turner
"""

# Append the getting-started guide and the author list to the card.
extra_sections = {
    "How to Get Started with the Model": get_started_code,
    "Model Authors": authors,
}
model_card.add(**extra_sections)

# Render the model card as the repository README.
readme_path = local_repo / "README.md"
model_card.save(readme_path)

# Copy this script into the repository to document how the model was built.
this_script = os.path.realpath(__file__)
hub_utils.add_files(this_script, dst=str(local_repo))