def _random_holdout_split(frame, train_fraction=0.9):
    """Split ``frame`` into ``(train, test)`` with an independent random row mask.

    Each row lands in the training part when a fresh uniform draw is below
    ``train_fraction``, so the split sizes vary slightly from call to call.
    """
    mask = np.random.rand(len(frame)) < train_fraction
    return frame[mask], frame[~mask]


def _repeated_holdout_roc(make_clf, frame, features, n_rounds=10):
    """Fit a fresh classifier on ``n_rounds`` random splits and score ROC AUC.

    Args:
        make_clf: zero-argument callable returning an unfitted classifier
            that offers ``fit`` and ``predict_proba``.
        frame: table holding the ``features`` columns and a ``'quit'`` target.
        features: column names used as model inputs.
        n_rounds: number of random train/test splits.

    Returns:
        ``(fpr, tpr, auc_mean, auc_std, clf)``. ``fpr``/``tpr`` and ``clf``
        come from the LAST round only; the AUC statistics cover all rounds.
    """
    # Imported here because the original function imported these locally;
    # the file's top-level imports are not guaranteed to provide them.
    from sklearn.metrics import roc_curve, auc

    auc_per_round = []
    fpr = tpr = clf = None
    for round_no in range(n_rounds):
        print(round_no + 1)  # progress indicator
        train, test = _random_holdout_split(frame)
        clf = make_clf()
        clf.fit(train[features], train['quit'])
        # Column 1 of predict_proba is the score for the positive class.
        fpr, tpr, _ = roc_curve(np.array(test['quit'].astype(int)),
                                clf.predict_proba(test[features])[:, 1])
        auc_per_round.append(auc(fpr, tpr))
    return fpr, tpr, np.mean(auc_per_round), np.std(auc_per_round), clf


def _repeated_holdout_scores(make_clf, frame, features, n_rounds=10):
    """Fit a fresh classifier on ``n_rounds`` random splits and collect metrics.

    Predictions are scored with the file-level ``performance`` helper, called
    as ``performance(predicted, actual)``; its result is read through the keys
    ``'F1_score'``, ``'precision'``, ``'recall'`` and ``'accuracy'``.

    Returns:
        ``(samples, clf)`` where ``samples`` maps ``'f1'``, ``'precision'``,
        ``'recall'`` and ``'accuracy'`` to the per-round values, and ``clf``
        is the classifier fitted in the last round.
    """
    samples = {'f1': [], 'precision': [], 'recall': [], 'accuracy': []}
    clf = None
    for round_no in range(n_rounds):
        print(round_no + 1)  # progress indicator
        train, test = _random_holdout_split(frame)
        clf = make_clf()
        clf.fit(train[features], train['quit'])
        res = performance(clf.predict(test[features]), test['quit'])
        samples['f1'].append(res['F1_score'])
        samples['precision'].append(res['precision'])
        samples['recall'].append(res['recall'])
        samples['accuracy'].append(res['accuracy'])
    return samples, clf


def _report_scores(samples):
    """Print mean and std of every metric and return them in bar-chart order.

    Returns:
        ``(means, stds)``, each ordered F1, recall, precision, accuracy to
        match the tick labels of the summary bar chart.
    """
    for name in ('f1', 'precision', 'recall', 'accuracy'):
        print('cv_%s_mean: %s' % (name, np.mean(samples[name])))
        print('cv_%s_std: %s' % (name, np.std(samples[name])))
        print('')
    bar_order = ('f1', 'recall', 'precision', 'accuracy')
    means = [np.mean(samples[name]) for name in bar_order]
    stds = [np.std(samples[name]) for name in bar_order]
    return means, stds


def _print_feature_importances(header, features, clf):
    """Print ``header`` followed by one ``feature importance`` pair per line."""
    print(header)
    for feature, value in zip(features, clf.feature_importances_):
        print('%s %s' % (feature, value))


def _plot_roc_curves(curves, figsize=None, label_fontsize=None,
                     legend_fontsize=None, chance_label=None, save_path=None):
    """Draw ROC curves plus the diagonal chance line, then show the figure.

    Args:
        curves: iterable of ``(fpr, tpr, color, line_style, label)`` tuples.
        figsize: optional figure size; the matplotlib default when ``None``.
        label_fontsize: optional font size for the two axis labels.
        legend_fontsize: optional font size for the legend.
        chance_label: optional legend entry for the diagonal line.
        save_path: when given, the figure is saved there (300 dpi) before
            being shown.
    """
    line_width = 2
    if figsize is None:
        plt.figure()
    else:
        plt.figure(figsize=figsize)
    for fpr, tpr, color, line_style, label in curves:
        plt.plot(fpr, tpr, color=color, lw=line_width, ls=line_style, label=label)
    chance_style = {'color': 'black', 'lw': line_width, 'linestyle': 'dashed'}
    if chance_label is not None:
        chance_style['label'] = chance_label
    plt.plot([0, 1], [0, 1], **chance_style)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    axis_text = {} if label_fontsize is None else {'fontsize': label_fontsize}
    plt.xlabel('False Positive Rate', **axis_text)
    plt.ylabel('True Positive Rate', **axis_text)
    plt.title('Receiver Operating Characteristic')
    legend_text = {} if legend_fontsize is None else {'fontsize': legend_fontsize}
    plt.legend(loc='lower right', **legend_text)
    if save_path is not None:
        plt.savefig(save_path, dpi=300)
    plt.show()


def _plot_score_bars(rf, gb, ab, save_path):
    """Save a grouped bar chart comparing the three models' metric scores.

    Args:
        rf, gb, ab: ``(means, stds)`` pairs for Random Forest, Gradient
            Boosting and Adaptive Boosting, each ordered F1, recall,
            precision, accuracy.
        save_path: file name the chart is written to (300 dpi).
    """
    ind = np.arange(4)  # the x locations for the groups
    width = 0.15  # the width of the bars
    # A separate plt.figure(figsize=(10, 5)) used to be opened right before
    # this call; it was never drawn on and only left an empty figure behind,
    # so the chart keeps the default size it has always been saved with.
    _fig, ax = plt.subplots()
    ax.set_position([0.1, 0.1, .75, 0.65])
    rects_ab = ax.bar(ind + width, ab[0], width, yerr=ab[1], color='#7A68A6',
                      hatch='/', edgecolor='black', ecolor='black')
    rects_rf = ax.bar(ind - width, rf[0], width, yerr=rf[1], color='#30a2da',
                      hatch='\\', edgecolor='black', ecolor='black')
    rects_gb = ax.bar(ind, gb[0], width, yerr=gb[1], color='#fc4f30',
                      edgecolor='black', ecolor='black')
    # add some text for labels, title and axes ticks
    ax.set_ylabel('Scores', fontsize=14)
    ax.set_title('')
    ax.set_ylim(0, 1)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('F1-measure', 'Recall', 'Precision', 'Accuracy'), fontsize=14)
    ax.legend((rects_rf[0], rects_gb[0], rects_ab[0]),
              ('Random\nForest', 'Gradient\nBoosting', 'Adaptive\nBoosting'),
              ncol=3, loc='upper center', bbox_to_anchor=(0.5, 1.25))
    plt.savefig(save_path, dpi=300)


def feature_add(feature_list, num):
    """Compare three ensemble classifiers on a feature subset and plot results.

    For Random Forest, Gradient Boosting and AdaBoost (512 estimators each)
    the function, in that order:

    1. runs 10 random ~90/10 train/test splits of the module-level ``df``,
       prints the mean and standard deviation of the ROC AUC, and shows a
       ROC plot;
    2. runs 10 random ~90/10 splits of the module-level ``df_balanced`` and
       prints the mean and standard deviation of F1, precision, recall and
       accuracy as computed by the file-level ``performance`` helper;
    3. prints the fitted model's ``feature_importances_``.

    It then saves a combined ROC figure to ``roc_auc10_model<num>.png`` and a
    grouped bar chart of the metric means (with std error bars) to
    ``metrics10_model<num>.png``.

    The splits are independent random masks, not k-fold partitions, and the
    random state is not seeded here, so results vary between calls. Each ROC
    curve drawn is the one from the last split, while the AUC in its legend
    is the mean over all ten splits.

    Args:
        feature_list: column names used as model inputs. ``df`` and
            ``df_balanced`` are presumably pandas DataFrames that contain
            these columns plus a ``'quit'`` target column -- they are defined
            outside this function.
        num: suffix inserted into the two output file names; any value that
            formats with ``%s`` is accepted.

    Returns:
        None. Results are printed, shown as figures and written to disk.
    """
    features = feature_list
    n_estimators = 512

    def make_rf_for_roc():
        return RandomForestClassifier(criterion='entropy', n_estimators=n_estimators, n_jobs=-1)

    def make_rf_for_scores():
        # Unlike the ROC configuration this one considers all features at
        # every split (max_features=None) and sets min_impurity_decrease.
        return RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                                      max_depth=None, max_features=None, max_leaf_nodes=None,
                                      min_impurity_decrease=1e-07,
                                      n_estimators=n_estimators, n_jobs=-1, oob_score=False,
                                      random_state=None, verbose=0, warm_start=False)

    def make_gb():
        return GradientBoostingClassifier(n_estimators=n_estimators)

    def make_ab():
        return AdaBoostClassifier(n_estimators=n_estimators)

    # Random Forest Classifier
    print('Random Forest Classifier')
    fpr_rf, tpr_rf, roc_auc_rf, roc_auc_rf_sd, _ = _repeated_holdout_roc(
        make_rf_for_roc, df, features)
    print('roc_auc_rf: %s' % roc_auc_rf)
    print('roc_auc_rf_sd: %s' % roc_auc_rf_sd)
    rf_curve = (fpr_rf, tpr_rf, '#30a2da', 'solid',
                'Random Forest (AUC = %0.2f)' % roc_auc_rf)
    _plot_roc_curves([rf_curve])
    rf_samples, rf_clf = _repeated_holdout_scores(make_rf_for_scores, df_balanced, features)
    rf_summary = _report_scores(rf_samples)
    _print_feature_importances('Random Forest classifier feature importance:',
                               features, rf_clf)

    # Gradient Boosting Classifier
    print('Gradient Boosting Classifier')
    fpr_gb, tpr_gb, roc_auc_gb, roc_auc_gb_sd, gb_roc_clf = _repeated_holdout_roc(
        make_gb, df, features)
    print('roc_auc_gb: %s' % roc_auc_gb)
    print('roc_auc_gb_sd: %s' % roc_auc_gb_sd)
    gb_curve = (fpr_gb, tpr_gb, '#fc4f30', 'dashed',
                'Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)
    _plot_roc_curves([gb_curve[:3] + ('solid', gb_curve[4])])
    # NOTE(review): these importances come from the model fitted on ``df`` in
    # the last ROC round, whereas the other two models report importances
    # from the ``df_balanced`` fit. Kept as is so the reported numbers do not
    # silently change -- confirm which one is intended.
    _print_feature_importances('Gradient Boosting classifier feature importance:',
                               features, gb_roc_clf)
    gb_samples, _ = _repeated_holdout_scores(make_gb, df_balanced, features)
    gb_summary = _report_scores(gb_samples)

    # Adaptive Boosting Classifier
    print('Adaptive Boosting Classifier')
    fpr_ab, tpr_ab, roc_auc_ab, roc_auc_ab_sd, _ = _repeated_holdout_roc(
        make_ab, df, features)
    print('roc_auc_ab: %s' % roc_auc_ab)
    print('roc_auc_ab_sd: %s' % roc_auc_ab_sd)
    ab_curve = (fpr_ab, tpr_ab, '#7A68A6', 'dotted',
                'Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)
    _plot_roc_curves([ab_curve[:3] + ('solid', ab_curve[4])])
    ab_samples, ab_clf = _repeated_holdout_scores(make_ab, df_balanced, features)
    ab_summary = _report_scores(ab_samples)
    _print_feature_importances('Adaptive Boosting classifier feature importance:',
                               features, ab_clf)

    # Combined ROC figure: one line style per model so the curves stay
    # distinguishable without colour.
    _plot_roc_curves([rf_curve, gb_curve, ab_curve],
                     figsize=(5, 5), label_fontsize=14, legend_fontsize=10,
                     chance_label='Random Guessing (AUC = 0.50)',
                     save_path='roc_auc10_model%s.png' % (num,))

    # Grouped bar chart of the metric means with std error bars.
    _plot_score_bars(rf_summary, gb_summary, ab_summary,
                     save_path='metrics10_model%s.png' % (num,))