%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import csv
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

# INSERT DATA PATH TO model_features_solo_10.csv HERE
# `path` is concatenated directly with the file name below, so it must end
# with a path separator. The empty string reads from the working directory.
path = ''

df = pd.read_csv(path+'model_features_solo_10.csv')
# Replace whatever header the CSV carries with the names used in this notebook.
cols = ['kills','deaths','assists','KDA','cum_kills','cum_assists','cum_deaths','cum_KDA','mean_kills','mean_assists','mean_deaths','mean_KDA','quit','player_id','experience','winner','performance','performance_session','match','session','match_duration','cum_match_duration','mean_match_duration']
df.columns = cols

# Per-player mean / count / sum of 'winner', sorted by mean (highest first).
player_stats = df.groupby('player_id')[['winner']].agg( ['mean','count','sum'] )
player_stats = player_stats.sort_values(by = [('winner','mean')], ascending = False)
# Single-argument print: same output under Python 2 and Python 3.
print('number of users: %s' % len(player_stats.index))

# feature engineering
df['log_cum_match_duration'] = np.log(df['cum_match_duration'])
for column in ('winner', 'kills', 'assists', 'deaths', 'match', 'session', 'experience'):
    df[column] = df[column].astype(int)
df['match_duration'] = df['match_duration'].astype(float)

# Random split of the full data: each row lands in `train` with probability 0.9.
mask = np.random.rand(len(df)) < 0.9
train = df[mask]
test = df[~mask]

# Balanced data set: every row with quit == 1 plus an equally sized random
# sample of rows with quit == 0.
# NOTE(review): sample() draws without replacement by default, so this
# presumably fails if quit == 1 rows outnumber quit == 0 rows -- confirm.
df_drop = df[df['quit'] == 1]
df_nodrop = df[df['quit'] == 0]
df_nodrop2 = df_nodrop.sample(n = len(df_drop.index)).copy()
df_balanced = pd.concat([df_drop,df_nodrop2])
mask = np.random.rand(len(df_balanced)) < 0.9
train_balanced = df_balanced[mask]
test_balanced = df_balanced[~mask]

def performance(prediction_values, test_values):
    """Compute binary-classification metrics from paired predictions and labels.

    Args:
        prediction_values: sized iterable of predicted labels.
        test_values: iterable of ground-truth labels, paired with
            ``prediction_values`` position by position.

    Items are classified with ``== True`` / ``== False``, so ``1``/``0`` and
    numpy booleans are accepted. A pair in which either item equals neither
    True nor False is not counted in any confusion-matrix cell.

    Returns:
        dict with float entries ``'precision'``, ``'recall'``, ``'accuracy'``
        and ``'F1_score'``, plus ``'nobs'`` (``len(prediction_values)``).
        A metric whose denominator is zero is reported as ``0.0`` instead of
        raising ZeroDivisionError.
    """
    tp = 0.0  # true positives
    fn = 0.0  # false negatives
    fp = 0.0  # false positives
    tn = 0.0  # true negatives

    # Equality (not truthiness / identity) is deliberate: it keeps 1/0 and
    # numpy bools working and leaves other values uncounted.
    for predicted, actual in zip(prediction_values, test_values):
        if actual == True:  # noqa: E712
            if predicted == True:  # noqa: E712
                tp += 1.0
            elif predicted == False:  # noqa: E712
                fn += 1.0
        elif actual == False:  # noqa: E712
            if predicted == True:  # noqa: E712
                fp += 1.0
            elif predicted == False:  # noqa: E712
                tn += 1.0

    def ratio(numerator, denominator):
        # Undefined ratios (empty denominator) are reported as 0.0.
        return numerator / denominator if denominator else 0.0

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)

    results = dict()
    results['precision'] = precision
    results['recall'] = recall
    results['accuracy'] = ratio(tp + tn, tp + tn + fp + fn)
    results['F1_score'] = ratio(2 * (precision * recall), precision + recall)
    results['nobs'] = len(prediction_values)

    return results

number of users: 5046

def _split_90_10(data):
    """Randomly split *data* into (train, test); each row goes to train with probability 0.9."""
    mask = np.random.rand(len(data)) < 0.9
    return data[mask], data[~mask]


def _roc_cross_validation(make_clf, data, features, n_rounds=10):
    """Fit a fresh classifier on n_rounds random splits and collect ROC AUCs.

    Returns (fpr, tpr, mean_auc, sd_auc, clf), where fpr/tpr and clf come
    from the LAST round only.
    """
    # Local import: the file-level imports do not include sklearn.metrics.
    from sklearn.metrics import roc_curve, auc

    auc_values = []
    for i in range(n_rounds):
        print(i + 1)
        train, test = _split_90_10(data)

        clf = make_clf()
        clf.fit(train[features], train['quit'])

        # ROC from the predicted probability of the positive class
        fpr, tpr, _ = roc_curve(np.array(test['quit'].astype(int)),
                                clf.predict_proba(test[features])[:, 1])
        auc_values.append(auc(fpr, tpr))

    return fpr, tpr, np.mean(auc_values), np.std(auc_values), clf


def _balanced_cross_validation(make_clf, data, features, n_rounds=10):
    """Fit a fresh classifier on n_rounds random splits and score it with performance().

    Returns (scores, clf): scores maps each performance() metric name to the
    list of per-round values; clf is the classifier of the LAST round.
    """
    scores = {'F1_score': [], 'precision': [], 'recall': [], 'accuracy': []}
    for i in range(n_rounds):
        print(i + 1)
        train, test = _split_90_10(data)

        clf = make_clf()
        clf.fit(train[features], train['quit'])

        res = performance(clf.predict(test[features]), test['quit'])
        for key in scores:
            scores[key].append(res[key])

    return scores, clf


def _report_cv_scores(scores):
    """Print mean and std of every metric; return ([means], [stds]) in bar-chart order."""
    for label, key in (('f1', 'F1_score'), ('precision', 'precision'),
                       ('recall', 'recall'), ('accuracy', 'accuracy')):
        print('cv_%s_mean: %s' % (label, np.mean(scores[key])))
        print('cv_%s_std: %s' % (label, np.std(scores[key])))
        print('')

    # Order matches the x tick labels of the bar chart in feature_add.
    chart_order = ('F1_score', 'recall', 'precision', 'accuracy')
    means = [np.mean(scores[key]) for key in chart_order]
    stds = [np.std(scores[key]) for key in chart_order]
    return means, stds


def _print_feature_importances(title, features, clf):
    """Print one 'feature importance' line per feature of the fitted classifier."""
    print('%s classifier feature importance:' % title)
    for feature, value in zip(features, clf.feature_importances_):
        print('%s %s' % (feature, value))


def _plot_single_roc(fpr, tpr, color, label):
    """Show one ROC curve together with the chance diagonal."""
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color=color, lw=lw, label=label)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right')
    plt.show()


def feature_add(feature_list,num):
    """Evaluate Random Forest, Gradient Boosting and AdaBoost on a feature set.

    For each classifier the function
      1. runs 10 random 90/10 splits of the module-level ``df`` and reports
         mean / sd of the ROC AUC for predicting ``quit``;
      2. runs 10 random 90/10 splits of the module-level ``df_balanced`` and
         reports mean / std of F1, precision, recall and accuracy as computed
         by ``performance``;
      3. prints the feature importances of one fitted model.

    Args:
        feature_list: column names of ``df`` used as predictors.
        num: tag inserted into the names of the saved figures.

    Returns:
        None. Results are printed; figures are shown and two are saved to the
        working directory: ``roc_auc10_model<num>.png`` and
        ``metrics10_model<num>.png``.

    NOTE: each plotted ROC curve is the one from the last split, while the
    AUC in its legend is the mean over all splits.
    """
    features = feature_list
    n_estimators = 512

    def make_rf_roc():
        return RandomForestClassifier(criterion='entropy', n_estimators=n_estimators, n_jobs=-1)

    def make_rf_balanced():
        return RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                  max_depth=None, max_features=None, max_leaf_nodes=None,
                  min_impurity_decrease=1e-07,
                  n_estimators=n_estimators, n_jobs=-1, oob_score=False, random_state=None,
                  verbose=0, warm_start=False)

    def make_gb():
        return GradientBoostingClassifier(n_estimators=n_estimators)

    def make_ab():
        return AdaBoostClassifier(n_estimators=n_estimators)

    # Random Forest Classifier
    print('Random Forest Classifier')
    fpr_rf, tpr_rf, roc_auc_rf, roc_auc_rf_sd, _ = _roc_cross_validation(make_rf_roc, df, features)
    print('roc_auc_rf: %s' % roc_auc_rf)
    print('roc_auc_rf_sd: %s' % roc_auc_rf_sd)
    _plot_single_roc(fpr_rf, tpr_rf, '#30a2da', 'Random Forest (AUC = %0.2f)' % roc_auc_rf)

    rf_cv, clf = _balanced_cross_validation(make_rf_balanced, df_balanced, features)
    rf_scores, rf_scores_std = _report_cv_scores(rf_cv)
    # Importances of the last model fitted on the balanced data.
    _print_feature_importances('Random Forest', features, clf)

    # Gradient Boosting Classifier
    print('Gradient Boosting Classifier')
    fpr_gb, tpr_gb, roc_auc_gb, roc_auc_gb_sd, clf = _roc_cross_validation(make_gb, df, features)
    print('roc_auc_gb: %s' % roc_auc_gb)
    print('roc_auc_gb_sd: %s' % roc_auc_gb_sd)
    _plot_single_roc(fpr_gb, tpr_gb, '#fc4f30', 'Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)

    # NOTE(review): unlike the other two classifiers, these importances come
    # from the last model fitted on the UNBALANCED data (kept as in the
    # original analysis) -- confirm whether that is intended.
    _print_feature_importances('Gradient Boosting', features, clf)

    gb_cv, _ = _balanced_cross_validation(make_gb, df_balanced, features)
    gb_scores, gb_scores_std = _report_cv_scores(gb_cv)

    # Adaptive Boosting Classifier
    print('Adaptive Boosting Classifier')
    fpr_ab, tpr_ab, roc_auc_ab, roc_auc_ab_sd, _ = _roc_cross_validation(make_ab, df, features)
    print('roc_auc_ab: %s' % roc_auc_ab)
    print('roc_auc_ab_sd: %s' % roc_auc_ab_sd)
    _plot_single_roc(fpr_ab, tpr_ab, '#7A68A6', 'Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)

    ab_cv, clf = _balanced_cross_validation(make_ab, df_balanced, features)
    ab_scores, ab_scores_std = _report_cv_scores(ab_cv)
    # Importances of the last model fitted on the balanced data.
    _print_feature_importances('Adaptive Boosting', features, clf)

    # Combined ROC figure for the three classifiers
    plt.figure(figsize = (5,5))
    lw = 2

    plt.plot(fpr_rf, tpr_rf, color='#30a2da',
             lw=lw, ls = 'solid', label='Random Forest (AUC = %0.2f)' % roc_auc_rf)
    plt.plot(fpr_gb, tpr_gb, color='#fc4f30',
             lw=lw, ls = 'dashed', label='Gradient Boosting (AUC = %0.2f)' % roc_auc_gb)
    plt.plot(fpr_ab, tpr_ab, color='#7A68A6',
             lw=lw, ls = 'dotted', label='Adaptive Boosting (AUC = %0.2f)' % roc_auc_ab)
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='dashed', label='Random Guessing (AUC = 0.50)')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate', fontsize = 14)
    plt.ylabel('True Positive Rate', fontsize = 14)
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='lower right', fontsize = 10)
    plt.savefig('roc_auc10_model%s.png' % num,dpi=300)
    plt.show()

    # Grouped bar chart of the balanced-data metrics
    N = 4

    ind = np.arange(N)  # the x locations for the groups
    width = 0.15       # the width of the bars

    # plt.subplots() always creates its own figure, so no separate
    # plt.figure() call is made here (it would only leave an empty figure).
    fig, ax = plt.subplots()
    ax.set_position([0.1,0.1,.75,0.65])
    rects1 = ax.bar(ind + width, ab_scores, width, yerr = ab_scores_std, color='#7A68A6', hatch = '/', edgecolor = 'black', ecolor = 'black')
    rects2 = ax.bar(ind - width, rf_scores, width, yerr = rf_scores_std,color='#30a2da', hatch= '\\', edgecolor = 'black',ecolor = 'black')
    rects3 = ax.bar(ind, gb_scores, width, yerr = gb_scores_std,color='#fc4f30', edgecolor = 'black',ecolor = 'black')

    # add some text for labels, title and axes ticks
    ax.set_ylabel('Scores', fontsize = 14)
    ax.set_title('')
    ax.set_ylim(0,1)
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(('F1-measure','Recall','Precision','Accuracy'), fontsize = 14)

    ax.legend(( rects2[0], rects3[0], rects1[0]), ('Random\nForest','Gradient\nBoosting', 'Adaptive\nBoosting'), ncol = 3, loc='upper center', 
              bbox_to_anchor=(0.5, 1.25))
    # The dot before the extension was missing in the original file name.
    plt.savefig('metrics10_model%s.png' % num,dpi=300)

# KDA
# session cumulated KDA
# session mean KDA
# win
# session accumulated win count
# session mean win count
# Column names presumably matching the features listed above, in the same
# order (TODO confirm the mapping of the last two):
# ['KDA','cum_KDA','mean_KDA','winner','performance_session','performance']

# Predictor columns handed to the classifiers; the target column 'quit' is
# not part of the list.
feature_list = ['match','match_duration','cum_match_duration','mean_match_duration','session',
                'player_id','experience','kills','deaths','assists','cum_kills','cum_deaths',
                'cum_assists','mean_kills','mean_deaths','mean_assists','KDA','cum_KDA','mean_KDA',
                'winner','performance_session','performance']
# Tag inserted into the names of the figures saved by feature_add.
num = '3'
feature_add(feature_list,num)

Random Forest Classifier
1
2
3
4
5
6
7
8
9
10
roc_auc_rf: 0.967610164302
roc_auc_rf_sd: 0.00080056131198

1
2
3
4
5
6
7
8
9
10
cv_f1_mean: 0.96192948678
cv_f1_mean: 0.000905076455103

cv_precision_mean: 0.927099038915
cv_precision_mean: 0.00171305742291

cv_recall_mean: 0.999481180845
cv_recall_mean: 0.000363561091863

cv_accuracy_mean: 0.960167434991
cv_accuracy_mean: 0.00103151023027

Random Forest classifier feature importance:
match 0.364139357621
match_duration 0.0196659612059
cum_match_duration 0.0124412582408
mean_match_duration 0.0135702945199
session 0.0120793719922
player_id 0.0178793669944
experience 0.0156913846901
kills 0.00828197467404
deaths 0.00662120898295
assists 0.00915843263576
cum_kills 0.00705378823822
cum_deaths 0.00562389914064
cum_assists 0.00717261455417
mean_kills 0.00753630661204
mean_deaths 0.00619852024605
mean_assists 0.0077481553964
KDA 0.0132243947198
cum_KDA 0.00964014873491
mean_KDA 0.0096469445208
winner 0.000397930578077
performance_session 0.110853166721
performance 0.335375518981
Gradient Boosting Classifier
1
2
3
4
5
6
7
8
9
10
0.976105808922
0.000632452705453

match 0.0870584562921
match_duration 0.0355248911613
cum_match_duration 0.0716129472629
mean_match_duration 0.0220520968507
session 0.0510534813092
player_id 0.0290648438088
experience 0.0577809915164
kills 0.00764355361192
deaths 0.0102722975876
assists 0.00755127810758
cum_kills 0.0205104590356
cum_deaths 0.0172562058809
cum_assists 0.0214787797015
mean_kills 0.00827869661069
mean_deaths 0.00724555144333
mean_assists 0.0143933109629
KDA 0.0127503100578
cum_KDA 0.0107731195658
mean_KDA 0.010402253529
winner 0.00262566135257
performance_session 0.193573010813
performance 0.301097803539
1
2
3
4
5
6
7
8
9
10
0.959100992643
0.00107256675888

0.921840311799
0.00207869096781

0.999503694893
0.000315524716747

0.957269052125
0.00115291972592

Adaptive Boosting Classifier
1
2
3
4
5
6
7
8
9
10
0.914952966041
0.00200901487973

1
2
3
4
5
6
7
8
9
10
0.887915217765
0.00317391357803

0.824244828788
0.00406062631042

0.962255467737
0.00295159997639

0.878307078039
0.00325184582501

feature importance match 0.015625
feature importance match_duration 0.03515625
feature importance cum_match_duration 0.01953125
feature importance mean_match_duration 0.009765625
feature importance session 0.12890625
feature importance player_id 0.017578125
feature importance experience 0.134765625
feature importance kills 0.0
feature importance deaths 0.0078125
feature importance assists 0.005859375
feature importance cum_kills 0.00390625
feature importance cum_deaths 0.00390625
feature importance cum_assists 0.00390625
feature importance mean_kills 0.001953125
feature importance mean_deaths 0.0078125
feature importance mean_assists 0.005859375
feature importance KDA 0.01171875
feature importance cum_KDA 0.00390625
feature importance mean_KDA 0.00390625
feature importance winner 0.001953125
feature importance performance_session 0.3671875
feature importance performance 0.208984375

<matplotlib.figure.Figure at 0x7fa231d7b7d0>