%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import roc_curve, auc
path = ''  # INSERT DATA PATH TO model_features_solo_10.csv HERE
df = pd.read_csv(path + 'model_features_solo_10.csv')
cols = ['kills', 'deaths', 'assists', 'KDA',
        'cum_kills', 'cum_assists', 'cum_deaths', 'cum_KDA',
        'mean_kills', 'mean_assists', 'mean_deaths', 'mean_KDA',
        'quit', 'player_id', 'experience', 'winner',
        'performance', 'performance_session', 'match', 'session',
        'match_duration', 'cum_match_duration', 'mean_match_duration']
df.columns = cols
player_stats = df.groupby('player_id')[['winner']].agg(['mean', 'count', 'sum'])
player_stats = player_stats.sort_values(by=[('winner', 'mean')], ascending=False)
print('number of users:', len(player_stats.index))
# feature engineering
df['log_cum_match_duration'] = np.log(df['cum_match_duration'])
df['winner'] = df['winner'].astype(int)
df['kills'] = df['kills'].astype(int)
df['assists'] = df['assists'].astype(int)
df['deaths'] = df['deaths'].astype(int)
df['match'] = df['match'].astype(int)
df['session'] = df['session'].astype(int)
df['experience'] = df['experience'].astype(int)
df['match_duration'] = df['match_duration'].astype(float)
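# The per-column casts above could equivalently be written as one astype()
# call with a dtype mapping -- a minimal sketch, behaviorally identical for
# these columns:
# df = df.astype({'winner': int, 'kills': int, 'assists': int, 'deaths': int,
#                 'match': int, 'session': int, 'experience': int,
#                 'match_duration': float})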
mask = np.random.rand(len(df)) < 0.9
train = df[mask]
test = df[~mask]
# build a class-balanced set by downsampling the majority (no-quit) class
df_drop = df[df['quit'] == 1]
df_nodrop = df[df['quit'] == 0]
df_nodrop2 = df_nodrop.sample(n=len(df_drop.index)).copy()
df_balanced = pd.concat([df_drop, df_nodrop2])
mask = np.random.rand(len(df_balanced)) < 0.9
train_balanced = df_balanced[mask]
test_balanced = df_balanced[~mask]
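# The random-mask split above yields a test fraction that only averages 10%.
# A minimal sketch of a fixed-size, stratified alternative using
# sklearn.model_selection.train_test_split (not what the pipeline above does;
# the *_alt names are illustrative) that preserves the quit/no-quit ratio:
from sklearn.model_selection import train_test_split
train_balanced_alt, test_balanced_alt = train_test_split(
    df_balanced, test_size=0.1, stratify=df_balanced['quit'])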
def performance(prediction_values, test_values):
    """Confusion-matrix based scores for binary predictions."""
    tp = fn = fp = tn = 0.0
    for i, j in zip(prediction_values, test_values):
        if i and j:
            tp += 1.0          # true positive
        elif (not i) and j:
            fn += 1.0          # false negative
        elif i and (not j):
            fp += 1.0          # false positive
        else:
            tn += 1.0          # true negative
    results = dict()
    results['precision'] = tp / (tp + fp)
    results['recall'] = tp / (tp + fn)
    results['accuracy'] = (tp + tn) / (tp + tn + fp + fn)
    results['F1_score'] = 2 * (results['precision'] * results['recall']) / (results['precision'] + results['recall'])
    results['nobs'] = len(prediction_values)
    return results
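# Sanity check: performance() should agree with sklearn's built-in metrics.
# A minimal sketch on toy arrays (y_pred / y_true are illustrative values):
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
y_pred = np.array([1, 0, 1, 1, 0])
y_true = np.array([1, 0, 0, 1, 1])
res = performance(y_pred, y_true)
assert np.isclose(res['precision'], precision_score(y_true, y_pred))
assert np.isclose(res['recall'], recall_score(y_true, y_pred))
assert np.isclose(res['F1_score'], f1_score(y_true, y_pred))
assert np.isclose(res['accuracy'], accuracy_score(y_true, y_pred))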
def feature_add(feature_list, num):
    """Fit RF / GB / AdaBoost models on the given features and report ROC AUC
    (full data) plus F1/precision/recall/accuracy (balanced data)."""
    features = feature_list
    n_estimators = 512
    # Random Forest Classifier
    print('Random Forest Classifier')
    cv_roc_auc_rf = []
    for i in range(10):
        print(i + 1)
        mask = np.random.rand(len(df)) < 0.9
        train = df[mask]
        test = df[~mask]
        clf = RandomForestClassifier(criterion='entropy', n_estimators=n_estimators, n_jobs=-1)
        clf.fit(train[features], train['quit'])
        # ROC curve and area under the curve for the positive (quit) class
        fpr_rf, tpr_rf, _ = roc_curve(np.array(test['quit'].astype(int)),
                                      clf.predict_proba(test[features])[:, 1])
        roc_auc_rf = auc(fpr_rf, tpr_rf)
        cv_roc_auc_rf.append(roc_auc_rf)
    roc_auc_rf = np.mean(cv_roc_auc_rf)
    roc_auc_rf_sd = np.std(cv_roc_auc_rf)
    print('roc_auc_rf:', roc_auc_rf)
    print('roc_auc_rf_sd:', roc_auc_rf_sd)
    plt.figure()
    lw = 2
    plt.plot(fpr_rf, tpr_rf, color='#30a2da',
             lw=lw, label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()
    cv_f1 = []
    cv_precision = []
    cv_recall = []
    cv_accuracy = []
    for i in range(10):
        print(i + 1)
        mask = np.random.rand(len(df_balanced)) < 0.9
        train_balanced = df_balanced[mask]
        test_balanced = df_balanced[~mask]
        clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                                     max_depth=None, max_features=None, max_leaf_nodes=None,
                                     min_impurity_decrease=1e-07,
                                     n_estimators=n_estimators, n_jobs=-1, oob_score=False,
                                     random_state=None, verbose=0, warm_start=False)
        clf.fit(train_balanced[features], train_balanced['quit'])
        res = performance(clf.predict(test_balanced[features]), test_balanced['quit'])
        cv_f1.append(res['F1_score'])
        cv_precision.append(res['precision'])
        cv_recall.append(res['recall'])
        cv_accuracy.append(res['accuracy'])
    print('cv_f1_mean:', np.mean(cv_f1))
    print('cv_f1_std:', np.std(cv_f1))
    print()
    print('cv_precision_mean:', np.mean(cv_precision))
    print('cv_precision_std:', np.std(cv_precision))
    print()
    print('cv_recall_mean:', np.mean(cv_recall))
    print('cv_recall_std:', np.std(cv_recall))
    print()
    print('cv_accuracy_mean:', np.mean(cv_accuracy))
    print('cv_accuracy_std:', np.std(cv_accuracy))
    print()
    rf_scores = [np.mean(cv_f1), np.mean(cv_recall), np.mean(cv_precision), np.mean(cv_accuracy)]
    rf_scores_std = [np.std(cv_f1), np.std(cv_recall), np.std(cv_precision), np.std(cv_accuracy)]
    # feature importance
    print()
    print('Random Forest classifier feature importance:')
    for feature, value in zip(features, clf.feature_importances_):
        print(feature, value)
    # Gradient Boosting Classifier
    print('Gradient Boosting Classifier')
    cv_roc_auc_gb = []
    for i in range(10):
        print(i + 1)
        mask = np.random.rand(len(df)) < 0.9
        train = df[mask]
        test = df[~mask]
        clf = GradientBoostingClassifier(n_estimators=n_estimators)
        clf.fit(train[features], train['quit'])
        # ROC curve and area under the curve for the positive (quit) class
        fpr_gb, tpr_gb, _ = roc_curve(np.array(test['quit'].astype(int)),
                                      clf.predict_proba(test[features])[:, 1])
        roc_auc_gb = auc(fpr_gb, tpr_gb)
        cv_roc_auc_gb.append(roc_auc_gb)
    roc_auc_gb = np.mean(cv_roc_auc_gb)
    roc_auc_gb_sd = np.std(cv_roc_auc_gb)
    print('roc_auc_gb:', roc_auc_gb)
    print('roc_auc_gb_sd:', roc_auc_gb_sd)
    plt.figure()
    lw = 2
    plt.plot(fpr_gb, tpr_gb, color='#fc4f30',
             lw=lw, label='Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()
    # feature importance
    print('Gradient Boosting classifier feature importance:')
    for feature, value in zip(features, clf.feature_importances_):
        print(feature, value)
    cv_f1 = []
    cv_precision = []
    cv_recall = []
    cv_accuracy = []
    for i in range(10):
        print(i + 1)
        mask = np.random.rand(len(df_balanced)) < 0.9
        train_balanced = df_balanced[mask]
        test_balanced = df_balanced[~mask]
        clf = GradientBoostingClassifier(n_estimators=n_estimators)
        clf.fit(train_balanced[features], train_balanced['quit'])
        res = performance(clf.predict(test_balanced[features]), test_balanced['quit'])
        cv_f1.append(res['F1_score'])
        cv_precision.append(res['precision'])
        cv_recall.append(res['recall'])
        cv_accuracy.append(res['accuracy'])
    print('cv_f1_mean:', np.mean(cv_f1))
    print('cv_f1_std:', np.std(cv_f1))
    print()
    print('cv_precision_mean:', np.mean(cv_precision))
    print('cv_precision_std:', np.std(cv_precision))
    print()
    print('cv_recall_mean:', np.mean(cv_recall))
    print('cv_recall_std:', np.std(cv_recall))
    print()
    print('cv_accuracy_mean:', np.mean(cv_accuracy))
    print('cv_accuracy_std:', np.std(cv_accuracy))
    print()
    gb_scores = [np.mean(cv_f1), np.mean(cv_recall), np.mean(cv_precision), np.mean(cv_accuracy)]
    gb_scores_std = [np.std(cv_f1), np.std(cv_recall), np.std(cv_precision), np.std(cv_accuracy)]
    # Adaptive Boosting Classifier
    print('Adaptive Boosting Classifier')
    cv_roc_auc_ab = []
    for i in range(10):
        print(i + 1)
        mask = np.random.rand(len(df)) < 0.9
        train = df[mask]
        test = df[~mask]
        clf = AdaBoostClassifier(n_estimators=n_estimators)
        clf.fit(train[features], train['quit'])
        # ROC curve and area under the curve for the positive (quit) class
        fpr_ab, tpr_ab, _ = roc_curve(np.array(test['quit'].astype(int)),
                                      clf.predict_proba(test[features])[:, 1])
        roc_auc_ab = auc(fpr_ab, tpr_ab)
        cv_roc_auc_ab.append(roc_auc_ab)
    roc_auc_ab = np.mean(cv_roc_auc_ab)
    roc_auc_ab_sd = np.std(cv_roc_auc_ab)
    print('roc_auc_ab:', roc_auc_ab)
    print('roc_auc_ab_sd:', roc_auc_ab_sd)
    plt.figure()
    lw = 2
    plt.plot(fpr_ab, tpr_ab, color='#7A68A6',
             lw=lw, label='Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()
    cv_f1 = []
    cv_precision = []
    cv_recall = []
    cv_accuracy = []
    for i in range(10):
        print(i + 1)
        mask = np.random.rand(len(df_balanced)) < 0.9
        train_balanced = df_balanced[mask]
        test_balanced = df_balanced[~mask]
        clf = AdaBoostClassifier(n_estimators=n_estimators)
        clf.fit(train_balanced[features], train_balanced['quit'])
        res = performance(clf.predict(test_balanced[features]), test_balanced['quit'])
        cv_f1.append(res['F1_score'])
        cv_precision.append(res['precision'])
        cv_recall.append(res['recall'])
        cv_accuracy.append(res['accuracy'])
    print('cv_f1_mean:', np.mean(cv_f1))
    print('cv_f1_std:', np.std(cv_f1))
    print()
    print('cv_precision_mean:', np.mean(cv_precision))
    print('cv_precision_std:', np.std(cv_precision))
    print()
    print('cv_recall_mean:', np.mean(cv_recall))
    print('cv_recall_std:', np.std(cv_recall))
    print()
    print('cv_accuracy_mean:', np.mean(cv_accuracy))
    print('cv_accuracy_std:', np.std(cv_accuracy))
    print()
    ab_scores = [np.mean(cv_f1), np.mean(cv_recall), np.mean(cv_precision), np.mean(cv_accuracy)]
    ab_scores_std = [np.std(cv_f1), np.std(cv_recall), np.std(cv_precision), np.std(cv_accuracy)]
    # feature importance
    print('Adaptive Boosting classifier feature importance:')
    for feature, value in zip(features, clf.feature_importances_):
        print(feature, value)
    # combined ROC curves (last CV fold of each classifier)
    plt.figure(figsize=(5, 5))
    lw = 2
    plt.plot(fpr_rf, tpr_rf, color='#30a2da',
             lw=lw, ls='solid', label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
    plt.plot(fpr_gb, tpr_gb, color='#fc4f30',
             lw=lw, ls='dashed', label='Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)
    plt.plot(fpr_ab, tpr_ab, color='#7A68A6',
             lw=lw, ls='dotted', label='Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed', label='Random Guessing (AUC = 0.50)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize=14)
    plt.ylabel('True Positive Rate', fontsize=14)
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right', fontsize=10)
    plt.savefig('roc_auc10_model' + num + '.png', dpi=300)
    plt.show()
    # grouped bar chart of mean CV scores with standard-deviation error bars
    N = 4
    ind = np.arange(N)  # the x locations for the groups
    width = 0.15        # the width of the bars
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.set_position([0.1, 0.1, 0.75, 0.65])
    rects1 = ax.bar(ind + width, ab_scores, width, yerr=ab_scores_std, color='#7A68A6',
                    hatch='/', edgecolor='black', ecolor='black')
    rects2 = ax.bar(ind - width, rf_scores, width, yerr=rf_scores_std, color='#30a2da',
                    hatch='\\', edgecolor='black', ecolor='black')
    rects3 = ax.bar(ind, gb_scores, width, yerr=gb_scores_std, color='#fc4f30',
                    edgecolor='black', ecolor='black')
    # add some text for labels, title and axes ticks
    ax.set_ylabel('Scores', fontsize=14)
    ax.set_ylim(0, 1)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('F1-measure', 'Recall', 'Precision', 'Accuracy'), fontsize=14)
    ax.legend((rects2[0], rects3[0], rects1[0]),
              ('Random\nForest', 'Gradient\nBoosting', 'Adaptive\nBoosting'),
              ncol=3, loc='upper center', bbox_to_anchor=(0.5, 1.25))
    plt.savefig('metrics10_model' + num + '.png', dpi=300)
# Dataset performance features: kills, deaths, assists; session cumulated
# kills/deaths/assists; session mean kills/deaths/assists.
feature_list = ['match', 'match_duration', 'cum_match_duration', 'mean_match_duration',
                'session', 'player_id', 'experience']
num = '1'
feature_add(feature_list, num)
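# The three classifier sections inside feature_add() repeat the same CV loop.
# A minimal refactoring sketch (cv_roc_auc is a hypothetical helper, not part
# of the original pipeline) that runs the 10-round random-mask ROC AUC
# evaluation for any scikit-learn classifier:
def cv_roc_auc(make_clf, data, features, target='quit', rounds=10):
    """Return per-round ROC AUC scores for a classifier factory."""
    scores = []
    for _ in range(rounds):
        mask = np.random.rand(len(data)) < 0.9  # ~90/10 random split
        train_, test_ = data[mask], data[~mask]
        clf = make_clf()
        clf.fit(train_[features], train_[target])
        fpr, tpr, _ = roc_curve(test_[target].astype(int),
                                clf.predict_proba(test_[features])[:, 1])
        scores.append(auc(fpr, tpr))
    return scores

# Example usage (same settings as the Random Forest section above):
# scores = cv_roc_auc(lambda: RandomForestClassifier(criterion='entropy',
#                                                    n_estimators=512, n_jobs=-1),
#                     df, feature_list)
# print(np.mean(scores), np.std(scores))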