# File size: 1,920 Bytes
#
# a4c05e4

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import save_model
from joblib import dump  # To save the TF-IDF vectorizer

# Train a small feed-forward network that predicts gender (1 = 'M', 0 = 'F')
# from character n-gram TF-IDF features of a name, then persist both the model
# and the fitted vectorizer and report accuracy on a held-out test split.

# 1. Read Data
# Expects 'gender.xlsx' to provide at least the columns 'Name' and 'Gender'.
data = pd.read_excel('gender.xlsx')

# 2. Preprocess Data
# Rows without a name cannot be vectorized; drop them and force the remaining
# names to str so numeric-looking cells do not break the char analyzer.
data = data.dropna(subset=['Name']).copy()
data['Name'] = data['Name'].astype(str)

# Normalise label text before mapping so values such as ' m' or 'f ' are not
# silently turned into NaN by .map().
data['Gender'] = (
    data['Gender'].astype(str).str.strip().str.upper().map({'M': 1, 'F': 0})
)

# Anything that is still unmapped (blank cells, unexpected codes) would become
# a NaN training target, so it is reported and removed instead.
unmapped = data['Gender'].isna()
if unmapped.any():
    print(f"Dropping {int(unmapped.sum())} row(s) with an unrecognised Gender value")
    data = data[~unmapped]

names = data['Name'].values
y = data['Gender'].astype(int).values  # Labels: 1 for Male, 0 for Female

# 3. Split the dataset into training and testing sets
# The split happens on the raw names, BEFORE fitting TF-IDF, so that the
# vocabulary and IDF weights are learned from training data only and the test
# accuracy is not inflated by leakage.
names_train, names_test, y_train, y_test = train_test_split(
    names, y, test_size=0.2, random_state=42
)

# 4. Convert text data into numerical data using TF-IDF
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(1, 3))
X_train = tfidf.fit_transform(names_train).toarray()  # fit on training names only
X_test = tfidf.transform(names_test).toarray()        # reuse the fitted vocabulary

# 5. Build the Neural Network Model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.5))  # Add dropout to prevent overfitting
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid for binary classification

# 6. Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 7. Train the model with epochs
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# 8. Save the model after training
model.save('gender_prediction_model.h5')

# 9. Save the TF-IDF vectorizer
# This is the same vectorizer the model was trained with, so inference code
# that loads both artifacts sees a matching feature space.
dump(tfidf, 'tfidf_vectorizer.joblib')

# 10. Evaluate the model
# model.predict returns shape (n, 1); flatten it so it lines up with y_test.
y_pred = (model.predict(X_test) > 0.5).astype("int32").ravel()
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")