# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
data = pd.read_excel('Employees.xlsx')
data.shape
data
The data contains 14999 employees and 10 features. The “left” column is the target variable, 1 for employees who left the company and 0 for those who didn't.
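As a quick sanity check (a minimal sketch, not one of the original cells), we can confirm the shape and the 0/1 encoding of the target before going further:
# optional sanity check on shape, dtypes, and target encoding
print(data.shape)              # expected: (14999, 10)
print(data.dtypes)
print(data['left'].unique())   # expected: only 0 and 1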
Looking at the "department" column, I found some redundant department names: support, technical, and IT. I assume they refer to the same department, so let's combine them all into 'IT'.
data['department']=np.where(data['department'] =='support', 'IT', data['department'])
data['department']=np.where(data['department'] =='technical', 'IT', data['department'])
data['department'].unique()
Before computing correlations, we need to quantify the qualitative features "salary" and "department": convert the "salary" column into ordinal integer values and the "department" column into dummy variables.
# Map salary into integers
salary_map = {"low": 0, "medium": 1, "high": 2}
data['salary'] = data['salary'].map(salary_map)
# Create dummy variables for department feature
data = pd.get_dummies(data, columns=["department"])
data.head()
data.shape
data.columns.tolist()
# create a correlation heatmap of all features in relation to left
fig, ax = plt.subplots(figsize = (2, 15))
cmap = sns.diverging_palette(130, 275, as_cmap=True)
sns.heatmap(data[data.columns[1:]].corr()[['left']], annot=True, linewidths=.4, fmt=".2f", cmap=cmap, ax=ax);
#sns.set(font_scale=4)
#plt.savefig("heatmap_fe_4.png")
Values close to zero mean there is no linear trend between the two variables. The closer the correlation is to 1, the more positively correlated they are: as one increases, so does the other, and the closer to 1, the stronger the relationship. A correlation close to -1 is similar, except that one variable decreases as the other increases.
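As a tiny illustration on synthetic data (not part of the notebook), a perfectly increasing relationship gives +1, a perfectly decreasing one gives -1, and independent noise sits near 0:
# toy illustration of correlation values (synthetic data)
x = np.arange(100)
print(np.corrcoef(x, 2 * x + 3)[0, 1])        # +1.0: both increase together
print(np.corrcoef(x, -2 * x + 3)[0, 1])       # -1.0: one decreases as the other increases
rng = np.random.RandomState(0)
print(np.corrcoef(x, rng.randn(100))[0, 1])   # near 0: no linear trend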
# check how many employees left
data.left.value_counts()
Out of the 14,999 employees, 3,571 left and 11,428 stayed.
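Expressed as a share of the workforce (a small extra step, not in the original cells), that is an attrition rate of roughly 24%:
# attrition rate as a percentage of all employees
data.left.value_counts(normalize=True) * 100   # ~23.8% left, ~76.2% stayed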
# find characteristics of employees who left
left = data.groupby('left')
left.mean()
Here you can see that employees who left the company had a lower satisfaction level, a lower promotion rate, a lower salary, and worked more hours compared to those who stayed.
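To make the satisfaction gap easier to see, one option (a minimal sketch, assuming the column is named satisfaction_level as in the standard dataset) is a boxplot split by the target:
# distribution of satisfaction_level for leavers vs. stayers
plt.figure(figsize=(6, 4))
sns.boxplot(x='left', y='satisfaction_level', data=data)
plt.title("Satisfaction level by attrition")
plt.show()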
# Visualize characteristics of employees who left, feature by feature
features=['number_project','time_spend_company','Work_accident','left', 'promotion_last_5years', 'salary']
plt.figure(figsize=(10, 15))
for i, j in enumerate(features):
    plt.subplot(3, 2, i + 1)
    plt.subplots_adjust(hspace=0.5)
    sns.countplot(x=j, data=data, hue='left', palette="husl")
    plt.xticks(rotation=90)
    plt.title("No. of employees")
Based on the count plots above, we can compare, feature by feature, the employees who left with those who stayed.
# select only the department dummy columns
data.iloc[:, 9:18]
# concatenate the department columns with the left column
data_department = pd.concat([data.iloc[:, 9:18], data['left']], axis=1)
data_department
# total employees per department
total_employees = data_department.iloc[:,0:8].sum().tolist()
total_employees
# number of employees who left and who stayed, per department
groupped_data = data_department.groupby('left').sum()
groupped_data
# percentage of employees who left, per department
list_percentage = list()
for i, j in enumerate(groupped_data.columns):
    left_employees = groupped_data.iat[1, i]   # row with index 1 holds the left == 1 group
    percentage = 100 * left_employees / total_employees[i]
    list_percentage.append([j, percentage])
list_percentage
# Create DataFrame Percentage
df_percentage = pd.DataFrame(list_percentage, columns = ['Department', 'Percentage'])
df_percentage
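To see which departments lose the largest share of their employees, the percentages can be sorted and plotted (a short extra step, reusing the df_percentage frame built above):
# departments ranked by attrition percentage
df_sorted = df_percentage.sort_values('Percentage', ascending=False)
plt.figure(figsize=(8, 4))
sns.barplot(x='Department', y='Percentage', data=df_sorted, color='steelblue')
plt.xticks(rotation=45)
plt.ylabel('% of employees who left')
plt.show()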
X = data.loc[:, data.columns != "left"]
y = data['left']
# Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
Evaluate each method using 10-fold cross-validation (CV) and the F1 score, then choose the one with the highest CV F1 score.
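The cells below evaluate each model in its own cell; an equivalent, more compact loop (a sketch doing the same comparison in one pass) would look like this:
# compact alternative: evaluate all candidate models with the same CV setup
from sklearn.model_selection import KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
cv = KFold(n_splits=10, shuffle=True, random_state=7)
models = {
    'Decision tree': DecisionTreeClassifier(),
    'Random forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(),
}
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
    print("%s average f1 score: %.3f" % (name, scores.mean()))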
from sklearn import tree
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
treeCV = tree.DecisionTreeClassifier()
treeCV.fit(X_train, y_train)
scoring = 'f1'
results = model_selection.cross_val_score(treeCV, X_train, y_train, cv=kfold, scoring=scoring)
print("Decision tree average f1 score: %.3f" % (results.mean()))
from sklearn.metrics import classification_report
print(classification_report(y_test, treeCV.predict(X_test)))
from sklearn.ensemble import RandomForestClassifier
rfCV = RandomForestClassifier()
rfCV.fit(X_train, y_train)
scoring = 'f1'
results = model_selection.cross_val_score(rfCV, X_train, y_train, cv=kfold, scoring=scoring)
print("Random forest average f1 score: %.3f" % (results.mean()))
print(classification_report(y_test, rfCV.predict(X_test)))
from sklearn.naive_bayes import GaussianNB
naiveCV = GaussianNB()
naiveCV.fit(X_train, y_train)
scoring = 'f1'
results = model_selection.cross_val_score(naiveCV, X_train, y_train, cv=kfold, scoring=scoring)
print("Naive Bayes average f1 score: %.3f" % (results.mean()))
print(classification_report(y_test, naiveCV.predict(X_test)))
from sklearn.svm import SVC
svmCV = SVC()
svmCV.fit(X_train, y_train)
scoring = 'f1'
results = model_selection.cross_val_score(svmCV, X_train, y_train, cv=kfold, scoring=scoring)
print("Support vector machine average f1 score: %.3f" % (results.mean()))
print(classification_report(y_test, svmCV.predict(X_test)))
Random Forest gives the highest F1 score.
# important features
features = data.columns.tolist()
features.remove('left')
feature_labels=np.array(features)
importance = rfCV.feature_importances_
feature_indexes_by_importance = np.flip(importance.argsort())
for index in feature_indexes_by_importance:
    print('{} - {:.2f}%'.format(feature_labels[index], importance[index] * 100.0))
Based on the Random Forest model, the five most important features are the ones at the top of the ranking printed above.
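The same ranking can be plotted (a minimal sketch reusing the arrays computed above) to make the gap between the top features and the rest visible:
# visualize the top 5 feature importances from the Random Forest
top = feature_indexes_by_importance[:5]
plt.figure(figsize=(8, 4))
plt.barh(feature_labels[top][::-1], importance[top][::-1])
plt.xlabel('Feature importance')
plt.title('Top 5 features (Random Forest)')
plt.show()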