%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import datetime
import csv
import os
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')
import seaborn as sns
import itertools
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf
from collections import defaultdict, Counter
from pandas.core import datetools
# Directory holding the raw export; edit this to point at your local copy.
path = '/Users/annas/Dropbox/Postdoc/Projects/mitigate/YFA_Mitigate/RoyalSociety_Data/Data/'#INSERT DATA PATH HERE
# Parse the header-less CSV in chunks of 100*1024 rows, then assemble the
# frame with ONE concat. Concatenating inside the loop copies the rows
# accumulated so far again for every chunk.
df_reader = pd.read_csv(os.path.join(path, 'structured_data_queue.csv'), header = None,
                        low_memory = False, chunksize = 100*1024)
df = pd.concat(df_reader)
# Column names for the header-less CSV, in file order.
cols = ['queueType','player_id', 'match_id', 'match_datetime', 'match_creation','match_duration','map_id',
        'champion_id','team_id','assists','deaths','champ_level','winner','combat_player_score','double_kills',
        'gold_earned','gold_spent','inhibitor_kills','killing_sprees','kills','largest_critical_strike','largest_killing_spree',
        'largest_multi_kill','magic_damage_dealt','magic_damage_dealt_to_champions','magic_damage_taken','minions_killed',
        'neutral_minions_killed','neutral_minions_killed_enemy_jungle','neutral_minions_killed_team_jungle',
        'objective_player_score','penta_kills','physical_damage_dealt','physical_damage_dealt_to_champions','physical_damage_taken',
        'quadra_kills','total_damage_dealt','total_damage_dealt_to_champions','total_damage_taken','total_heal',
        'total_player_score','total_score_rank','total_time_crowd_control_dealt','total_units_healed','triple_kills',
        'true_damage_dealt','true_damage_dealt_to_champions','true_damage_taken','unreal_kills','vision_wards_bought_in_game',
        'wards_killed','wards_placed']
df.columns = cols
# Discard every row that has at least one missing field.
df = df.dropna(axis=0, how='any')
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print('number of matches: %s' % df['match_id'].nunique())
# Kills plus assists per death; the +1 keeps death-less games finite.
df['KDA'] = (df['assists'] + df['kills'])/(df['deaths']+1)
# select only solo queue matches
df_solo = df[df['queueType']=='RANKED_SOLO_5x5']
print('number of solo-queue matches: %s' % df_solo['match_id'].nunique())
# setting individual players match threshold
threshold = 10 # minimum number of matches per user
players = df_solo['player_id'].value_counts()
frequent_players = players[players >= threshold]
print('number of threshold selected palyers: %s' % len(frequent_players))
# From here on `players` is a one-column frame ('id') of the retained players.
players = pd.DataFrame(frequent_players.index, columns = ['id'])
df_s10 = df_solo[df_solo['player_id'].isin(players['id'])]
print('number of solo-queue matches after putting threshold on palyers: %s'
      % df_s10['match_id'].nunique())
# Per-player mean / count / sum of every column; the result has two-level
# column labels such as ('winner', 'mean').
# NOTE(review): this aggregates non-numeric columns too (e.g. queueType);
# confirm the installed pandas tolerates that.
player_stats = df_s10.groupby('player_id').agg( ['mean','count','sum'] )
import matplotlib.gridspec as gridspec

# Per-player win rate: scatter against number of matches (first panel) and
# histogram (second panel). The grid has three cells; the third stays empty.
fig = plt.figure(figsize = (18,5))
gs = gridspec.GridSpec(1, 3)
fig.subplots_adjust(wspace=.25)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])
font_size = 16
data = np.array(player_stats[('winner','mean')].astype(float))
# ax1
ax1.scatter(player_stats[('winner','count')], player_stats[('winner','mean')], marker = 'o', alpha = .25, color = '#30a2da')
ax1.set_xlabel('matches', fontsize=font_size)
ax1.set_ylabel('winner', fontsize=font_size)
ax1.set_ylim(0,1)
# NOTE(review): `basex` is the keyword older Matplotlib releases expect; newer
# releases spell it `base` - confirm against the installed version.
ax1.set_xscale('log', basex=2)
# xmin/xmax of axhline are axes fractions in [0, 1], not data values; the
# defaults already draw the reference line across the full width.
ax1.axhline(y = .5, linewidth=2, color = 'k', ls = 'dashed')
#ax2
ax2.hist(data, bins = 30, histtype = 'stepfilled', alpha = .75, color = '#30a2da')
ax2.set_xlabel('winner', fontsize=font_size)
ax2.set_ylabel('counts', fontsize=font_size)
ax2.set_xlim(0,1)
# Same for axvline: ymin/ymax are axes fractions, so the defaults are used.
ax2.axvline(x = .5, linewidth=2, color = 'k', ls = 'dashed')
#plt.savefig('winner_scatterplot.png', dpi=300)
print(np.mean(data))
print(np.std(data))
# Pearson correlation between matches played and win rate.
cor = np.corrcoef(player_stats[('winner','count')], player_stats[('winner','mean')])
print(cor)
# Per-player mean KDA: scatter against number of matches (first panel) and
# histogram (second panel). The grid has three cells; the third stays empty.
fig = plt.figure(figsize = (18,5))
gs = gridspec.GridSpec(1, 3)
fig.subplots_adjust(wspace=.25)
ax1 = fig.add_subplot(gs[0])
ax2 = fig.add_subplot(gs[1])
font_size = 16
data = np.array(player_stats[('KDA','mean')].astype(float))
mean_kda = np.mean(data)
# ax1
ax1.scatter(player_stats[('KDA','count')], player_stats[('KDA','mean')], marker = 'o', alpha = .25, color = '#30a2da')
ax1.set_xlabel('matches', fontsize=font_size)
ax1.set_ylabel('KDA', fontsize=font_size)
# xmin/xmax of axhline are axes fractions in [0, 1], not data values; the
# defaults already draw the reference line across the full width.
ax1.axhline(y = mean_kda, linewidth=2, color = 'k', ls = 'dashed')
#ax2
ax2.hist(data, bins = 30, histtype = 'stepfilled', alpha = .75, color = '#30a2da')
ax2.set_xlabel('KDA', fontsize=font_size)
ax2.set_ylabel('counts', fontsize=font_size)
# Same for axvline: ymin/ymax are axes fractions, so the defaults are used.
ax2.axvline(x = mean_kda, linewidth=2, color = 'k', ls = 'dashed')
#plt.savefig('KDA_scatterplot.png', dpi=300)
print(mean_kda)
print(np.std(data))
# Pearson correlation between matches played and mean KDA.
cor = np.corrcoef(player_stats[('KDA','count')], player_stats[('KDA','mean')])
print(cor)
# Per-player win statistics, best win rate first, restricted to players with
# at least `threshold` matches.
playerid_stats = df_s10.groupby('player_id')[['winner']].agg( ['mean','count','sum'])
playerid_stats = playerid_stats.sort_values(by = [('winner','mean')], ascending = False)
playerid_stats = playerid_stats[playerid_stats[('winner','count')] >= threshold]

# Idle time, in hours, between the end of one match and the start of the
# player's next one: (start[i] - start[i-1]) - duration[i-1].
# Assumes match_duration is in seconds and rows are in chronological order
# per player - TODO confirm against the data export.
inter_arrival_times = []
matches_by_player = df_s10.groupby('player_id')  # group once instead of masking the frame per player
for player_id in playerid_stats.index:
    temp = matches_by_player.get_group(player_id)
    # Not named `datetime`: that would shadow the module imported at the top.
    match_start = pd.to_datetime(temp['match_datetime'])
    # Work on plain arrays so the two operands pair up by position; their
    # index labels are offset by one row and must not be aligned.
    inter = match_start.diff().dt.total_seconds().values[1:] / 3600.0
    duration = temp['match_duration'].values[:-1] / 3600.0
    inter_arrival_times.extend(inter - duration)
class Sequence(object):
    """Container pairing one player with the list of their play sessions.

    Both attributes are plain and public; callers may construct an empty
    instance and fill it in afterwards, or pass the values directly.

    Parameters
    ----------
    player_id : optional
        Identifier of the player; defaults to ``None``.
    sequences : iterable, optional
        Initial sessions; copied into a new list. Defaults to empty.
    """

    def __init__(self, player_id=None, sequences=None):
        self.player_id = player_id
        # Always a fresh list so instances never share state.
        self.sequences = [] if sequences is None else list(sequences)

    def __repr__(self):
        return '%s(player_id=%r, n_sequences=%d)' % (
            type(self).__name__, self.player_id, len(self.sequences))
# compute original sessions
m = 0.25  # idle time (hours) between two matches that closes a session


def _split_into_sessions(player_matches, min_gap_hours, shuffle=False):
    """Cut one player's matches into sessions at long idle gaps.

    A session is closed after match ``i-1`` when the idle time between the
    end of match ``i-1`` and the start of match ``i`` is at least
    ``min_gap_hours``.

    Parameters
    ----------
    player_matches : DataFrame
        One player's matches with a 0..n-1 index and the columns
        'match_datetime' and 'match_duration'.
        Assumes match_duration is in seconds and rows are chronological -
        TODO confirm against the data export.
    min_gap_hours : float
        Idle-time threshold in hours.
    shuffle : bool
        When true, the rows of every session are randomly permuted and
        re-indexed from 0.

    Returns
    -------
    list of DataFrame
        The closed sessions in order. Matches after the last qualifying gap
        are not returned, because that session is never closed.
    """
    starts = pd.to_datetime(player_matches['match_datetime'])
    # Parse all timestamps once and pair operands by position via plain
    # arrays: (start[i] - start[i-1]) - duration[i-1], in hours.
    # total_seconds() keeps any sub-second part of the difference.
    elapsed = starts.diff().dt.total_seconds().values[1:] / 3600.0
    played = player_matches['match_duration'].values[:-1] / 3600.0
    idle = elapsed - played

    sessions = []
    first = 0
    for offset, gap in enumerate(idle):
        if gap >= min_gap_hours:
            following = offset + 1  # row that opens the next session
            session = player_matches.iloc[first:following].copy()
            if shuffle:
                session = session.sample(frac=1).reset_index(drop=True)
            sessions.append(session)
            first = following
    return sessions


_matches_by_player = df_s10.groupby('player_id')  # group once, look up per player

sequences = []
for player_id in playerid_stats.index:
    obj = Sequence()
    obj.player_id = player_id
    temp = _matches_by_player.get_group(player_id).reset_index().drop(['index','queueType'], axis=1)
    obj.sequences.extend(_split_into_sessions(temp, m))
    sequences.append(obj)

# compute randomized index sessions
# Same sessions, with the order of the matches inside each one shuffled.
# NOTE(review): the shuffle is unseeded, so this differs between runs.
randomized_sequences = []
for player_id in playerid_stats.index:
    obj = Sequence()
    obj.player_id = player_id
    temp = _matches_by_player.get_group(player_id).reset_index().drop(['index','queueType'], axis=1)
    obj.sequences.extend(_split_into_sessions(temp, m, shuffle=True))
    randomized_sequences.append(obj)
### code to generate figures
from collections import OrderedDict
def sequence_analysis(sequence_length, sequences, feature, verbose = True):
    """Summarise ``feature`` per match position over sessions of one length.

    Only sessions with exactly ``sequence_length`` matches are used. For each
    position 1..sequence_length the values of ``feature`` at that position
    are pooled over all such sessions.

    Parameters
    ----------
    sequence_length : int
        Number of matches a session must contain to be included.
    sequences : iterable
        Objects exposing a ``sequences`` attribute: a list of DataFrames,
        one per session, each holding a ``feature`` column.
    feature : str
        Name of the column to summarise.
    verbose : bool
        When true, print the statistics of every position.

    Returns
    -------
    OrderedDict
        Keys are the positions as strings ('1', '2', ...) in numeric order.
        Each value is a dict with 'mu' (mean), 'sigma' (standard deviation),
        'n' (number of observations) and 'ci' (1.96 * sigma / sqrt(n)).
        A position without observations gets n = 0 and NaN elsewhere.
    """
    # One bucket per match position; read each session's column once.
    observations = [[] for _ in range(sequence_length)]
    for seq in sequences:
        for session_df in seq.sequences:
            if len(session_df.index) != sequence_length:
                continue
            for position, value in enumerate(session_df[feature].values):
                observations[position].append(value)

    # Insert in numeric order: sorting the string keys would put '10'
    # before '2' for sessions of ten or more matches.
    stats = OrderedDict()
    for session in range(1, sequence_length + 1):
        values = observations[session - 1]
        n = len(values)
        if n:
            mu = np.mean(values)
            sigma = np.std(values)
            ci = 1.96 * sigma / np.sqrt(n)
        else:
            # No session of this length: report NaN instead of failing.
            mu = sigma = ci = float('nan')
        stats[str(session)] = dict(mu = mu, sigma = sigma, n = n, ci = ci)
        if verbose:
            # Single-argument print(...) works under Python 2 and Python 3.
            print('Session = %s | Sequence length = %s' % (session, sequence_length))
            print('mu = %s' % mu)
            print('sigma = %s' % sigma)
            print('n obs = %s' % n)
            print('n ci = %s' % ci)
            print('-' * 40)
    return stats
# Win rate by match position for sessions of 1 to 5 matches: observed
# sessions (left) against sessions with shuffled match order (right).
font_size = 16
feature = 'winner'
ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)
for axis in (orig, rand):
    axis.set_xlabel('match position', fontsize=font_size)
    axis.set_ylabel('win rate', fontsize=font_size)
    axis.set_xlim(0.5,5.5)
    # Only the tick positions are passed: older Matplotlib reads a second
    # positional argument of set_xticks as `minor`, not as labels.
    axis.set_xticks(np.arange(1,6))
# Relative change (%) between the first and last match of a session,
# collected per session length.
rand_bar_y = []
bar_x = []
bar_y = []
for i in range(1,6):
    stats = sequence_analysis(i, sequences, feature, verbose = True)
    rand_stats = sequence_analysis(i, randomized_sequences, feature, verbose = True)
    x = list(range(1,i+1))
    y = [stats[item]['mu'] for item in stats]
    err = [stats[item]['ci'] for item in stats]
    rand_y = [rand_stats[item]['mu'] for item in stats]
    rand_err = [rand_stats[item]['ci'] for item in stats]
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    bar_x.append(x[-1])
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    style = len(y) - 1  # one marker / line style / colour per session length
    orig.errorbar(x, y, yerr = err, fmt = sym[style], linestyle = ls[style], color = col[style],
                  lw = 3, markersize = 10, capthick = 3, capsize = 6)
    rand.errorbar(x, rand_y, yerr = rand_err, fmt = sym[style], linestyle = ls[style], color = col[style],
                  lw = 3, markersize = 10, capthick = 3, capsize = 6)
rand.legend([str(i) + ' matches' for i in range(1,6)],
            loc = 'lower right', shadow = True, fancybox = True)
orig.set_title('original sessions', fontsize = font_size)
rand.set_title('randomized index sessions', fontsize = font_size)
#plt.savefig('deterioration_winner.png',dpi=300)
plt.show()
# Mean KDA by match position for sessions of 1 to 5 matches: observed
# sessions (left) against sessions with shuffled match order (right).
feature = 'KDA'
ls = ['solid','solid','dashed','dashdot','dotted']
sym = ['o','*','s','v','^']
col = ['#30a2da', '#fc4f30', '#e5ae38', '#6d904f', '#8b8b8b']
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
orig = fig.add_subplot(121)
rand = fig.add_subplot(122, sharey = orig)
for axis in (orig, rand):
    axis.set_xlabel('match position', fontsize=font_size)
    axis.set_ylabel('KDA', fontsize=font_size)
    axis.set_xlim(0.5,5.5)
    # Only the tick positions are passed: older Matplotlib reads a second
    # positional argument of set_xticks as `minor`, not as labels.
    axis.set_xticks(np.arange(1,6))
# Relative change (%) between the first and last match of a session,
# collected per session length.
rand_bar_y = []
bar_x = []
bar_y = []
for i in range(1,6):
    stats = sequence_analysis(i, sequences, feature, verbose = False)
    rand_stats = sequence_analysis(i, randomized_sequences, feature, verbose = False)
    x = list(range(1,i+1))
    y = [stats[item]['mu'] for item in stats]
    err = [stats[item]['ci'] for item in stats]
    rand_y = [rand_stats[item]['mu'] for item in stats]
    rand_err = [rand_stats[item]['ci'] for item in stats]
    bar_y.append((y[-1] - y[0]) / float(y[0]) * 100)
    bar_x.append(x[-1])
    rand_bar_y.append( (rand_y[-1] - rand_y[0]) / float(rand_y[0]) * 100 )
    style = len(y) - 1  # one marker / line style / colour per session length
    orig.errorbar(x, y, yerr = err, fmt = sym[style], linestyle = ls[style], color = col[style],
                  lw = 3, markersize = 10, capthick = 3, capsize = 6)
    rand.errorbar(x, rand_y, yerr = rand_err, fmt = sym[style], linestyle = ls[style], color = col[style],
                  lw = 3, markersize = 10, capthick = 3, capsize = 6)
rand.legend([str(i) + ' matches' for i in range(1,6)],
            loc = 'upper right', shadow = True, fancybox = True)
orig.set_title('original sessions', fontsize=font_size)
rand.set_title('randomized index sessions', fontsize=font_size)
#plt.savefig('deterioration_KDA.png', dpi=300)
plt.show()
# Hard-coded percentage changes used by the two bar-chart figures that follow.
# Each list has five entries, plotted against bar_x (session lengths 1 to 5).
# "above" lists feed the 'High Experience Players' panel and "below" lists
# the 'Low Experience Players' panel; "_w" is win rate, "_kda" is KDA, and
# "rand" marks the randomized-index sessions.
# NOTE(review): these values are not computed anywhere in this script;
# presumably they were copied from bar_y / rand_bar_y after re-running the
# deterioration loops on the top-5% and bottom-5% players - TODO confirm and
# regenerate if the input data changes.
above_rand_bar_y_w =[0.0, 0.098891375631079581, 1.9668985368193823, -0.29231995748073775, 1.5211267605633785]
above_bar_y_w=[0.0, 5.1275210756589455, -3.7903603182030885, -8.2938388625592427, -11.062431544359256]
above_rand_bar_y_kda=[0.0, 0.096883076604551838, 1.4126766902006405, -0.2720597529709291, -0.6356256393628531]
above_bar_y_kda=[0.0, 1.8406555581309387, -2.9021423088618317, -4.5344283604762419, -7.3743866611672066]
below_bar_y_w= [0.0, 8.0568720379146868, 3.3333333333333179, -35.0, -23.07692307692307]
below_rand_bar_y_w= [0.0, 2.3041474654377807, 3.4482758620689724, 13.333333333333334, -21.428571428571423]
below_bar_y_kda=[0.0, 4.040344227742108, 3.3484269606076666, -20.001191957622233, -30.378195392012262]
below_rand_bar_y_kda=[0.0, -2.7673605007987367, -18.395179155755489, 43.152967294420975, -16.984859819786116]
# Relative win-rate change per session length, original against randomized
# sessions, for high-experience (left) and low-experience (right) players.
# bar_x holds the session lengths collected by the deterioration loop.
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
above = fig.add_subplot(121)
below = fig.add_subplot(122)
above.bar(np.array(bar_x) - .1, above_bar_y_w, color = '#30a2da', width = .25, label = 'original sessions')
above.bar(np.array(bar_x) + .1, above_rand_bar_y_w, color = '#fc4f30', hatch = '//', width = .25, label = 'randomized index sessions')
above.set_title('High Experience Players', fontsize = font_size)
above.set_ylabel(r'$\Delta \% $ win rate', fontsize = font_size)
above.set_xlabel('session length', fontsize = font_size)
above.set_xlim(0.5,5.5)
above.set_ylim(-20,10)
# xmin/xmax of axhline are axes fractions in [0, 1]; the defaults already
# draw the zero line across the full width.
above.axhline(y = 0, linewidth=2, color = 'k', ls = 'dashed')
# Only the tick positions are passed: older Matplotlib reads a second
# positional argument of set_xticks as `minor`, not as labels.
above.set_xticks(bar_x)
above.legend(loc = 'lower left')
below.bar(np.array(bar_x) - .1, below_bar_y_w, color = '#30a2da', width = .25, label = 'original sessions')
below.bar(np.array(bar_x) + .1, below_rand_bar_y_w, color = '#fc4f30', hatch = '//', width = .25, label = 'randomized index sessions')
below.set_title('Low Experience Players', fontsize = font_size)
below.set_ylabel(r'$\Delta \% $ win rate', fontsize = font_size)
below.set_xlabel('session length', fontsize = font_size)
below.set_xlim(0.5,5.5)
below.set_ylim(-40,20)
below.axhline(y = 0, linewidth=2, color = 'k', ls = 'dashed')
below.set_xticks(bar_x)
#plt.savefig('deterioration_winner_above95below5.png',dpi=300)
# Relative KDA change per session length, original against randomized
# sessions, for high-experience (left) and low-experience (right) players.
# bar_x holds the session lengths collected by the deterioration loop.
font_size = 16
fig = plt.figure(figsize = (10,5))
fig.subplots_adjust(wspace=.25)
above = fig.add_subplot(121)
below = fig.add_subplot(122)
above.bar(np.array(bar_x) - .1, above_bar_y_kda, color = '#30a2da', width = .25, label = 'original sessions')
above.bar(np.array(bar_x) + .1, above_rand_bar_y_kda, color = '#fc4f30', hatch = '//', width = .25, label = 'randomized index sessions')
above.set_title('High Experience Players', fontsize = font_size)
above.set_ylabel(r'$\Delta \% $ KDA', fontsize = font_size)
above.set_xlabel('session length', fontsize = font_size)
above.set_xlim(0.5,5.5)
above.set_ylim(-10,10)
# xmin/xmax of axhline are axes fractions in [0, 1]; the defaults already
# draw the zero line across the full width.
above.axhline(y = 0, linewidth=2, color = 'k', ls = 'dashed')
# Only the tick positions are passed: older Matplotlib reads a second
# positional argument of set_xticks as `minor`, not as labels.
above.set_xticks(bar_x)
above.legend(loc = 'lower left')
below.bar(np.array(bar_x) - .1, below_bar_y_kda, color = '#30a2da', width = .25, label = 'original sessions')
below.bar(np.array(bar_x) + .1, below_rand_bar_y_kda, color = '#fc4f30', hatch = '//', width = .25, label = 'randomized index sessions')
below.set_title('Low Experience Players', fontsize = font_size)
below.set_ylabel(r'$\Delta \% $ KDA', fontsize = font_size)
below.set_xlabel('session length', fontsize = font_size)
below.set_xlim(0.5,5.5)
below.set_ylim(-35,35)
below.axhline(y = 0, linewidth=2, color = 'k', ls = 'dashed')
below.set_xticks(bar_x)
#plt.savefig('deterioration_KDA_above95below5.png',dpi=300)
# Plot settings for the box plots below: larger axis-label font and
# 14-point y tick labels on top of the 'seaborn-white' style.
font_size = 22
plt.style.use('seaborn-white')
# Set after the style sheet so the style cannot override the tick size.
plt.rcParams['ytick.labelsize'] = 14
def _sessions_by_player(player_ids, matches, min_gap_hours):
    """Build one Sequence of closed sessions for each requested player.

    A session is closed after match ``i-1`` when the idle time between the
    end of match ``i-1`` and the start of match ``i`` is at least
    ``min_gap_hours``. Matches after the last such gap are not included.

    Parameters
    ----------
    player_ids : iterable
        Players to process, in output order.
    matches : DataFrame
        Match rows with 'player_id', 'match_datetime' and 'match_duration'.
        Assumes match_duration is in seconds and rows are chronological per
        player - TODO confirm against the data export.
    min_gap_hours : float
        Idle-time threshold in hours.

    Returns
    -------
    list of Sequence
    """
    grouped = matches.groupby('player_id')  # group once, look up per player
    result = []
    for player_id in player_ids:
        # Fresh 0..n-1 index; the old index is dropped.
        player_matches = grouped.get_group(player_id).reset_index(drop=True)
        starts = pd.to_datetime(player_matches['match_datetime'])
        # Parse timestamps once and pair operands by position via arrays:
        # (start[i] - start[i-1]) - duration[i-1], in hours.
        elapsed = starts.diff().dt.total_seconds().values[1:] / 3600.0
        played = player_matches['match_duration'].values[:-1] / 3600.0
        obj = Sequence()
        obj.player_id = player_id
        first = 0
        for offset, gap in enumerate(elapsed - played):
            if gap >= min_gap_hours:
                following = offset + 1  # row that opens the next session
                obj.sequences.append(player_matches.iloc[first:following].copy())
                first = following
        result.append(obj)
    return result


# compute original sessions of the most experienced players
# (match count at or above the 95th percentile)
player_stats_top = playerid_stats[playerid_stats[('winner','count')] >= np.percentile(playerid_stats[('winner','count')], 95)]
sequences_top = _sessions_by_player(player_stats_top.index, df_s10, m)

# compute original sessions of the least experienced players
# (match count at or below the 5th percentile)
player_stats_worst = playerid_stats[playerid_stats[('winner','count')] <= np.percentile(playerid_stats[('winner','count')], 5)]
sequences_worst = _sessions_by_player(player_stats_worst.index, df_s10, m)
# Average number of matches per session for every high-experience player
# that has at least one non-empty session.
top_mean_sessions = []
for holder in sequences_top:
    lengths = [len(played.index) for played in holder.sequences if len(played.index) > 0]
    if len(lengths) > 0:
        top_mean_sessions.append(np.mean(lengths))

# Same statistic for the low-experience players.
worst_mean_sessions = []
for holder in sequences_worst:
    lengths = [len(played.index) for played in holder.sequences if len(played.index) > 0]
    if len(lengths) > 0:
        worst_mean_sessions.append(np.mean(lengths))
# boxplot
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
# The verdict printed below is about a difference in either direction, so
# the two-sided alternative is requested explicitly; the default of
# mannwhitneyu has not been the same in every SciPy release.
test = mannwhitneyu(worst_mean_sessions, top_mean_sessions, alternative='two-sided')
print(test)
print(test.pvalue)
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')
fig = plt.figure(figsize = (7,5))
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_sessions, top_mean_sessions], patch_artist=True, widths = .75)
ax.set_xticklabels(['Low Experience\nPlayers', 'High Experience\nPlayers'], fontsize = 18)
ax.set_ylabel('Average Session Length',fontsize = font_size)
ax.set_yticks([1,2,3,4,5])
for box in bp['boxes']:
    # outline first: `color` sets edge and face together ...
    box.set( color='black', linewidth=2)
    # ... then the fill colour overrides the face
    box.set( facecolor = 'white', alpha = .75 )
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#fc4f30', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
fig.tight_layout()
#plt.savefig('average_session_all.png',dpi=300)
plt.show()
# Average total match_duration per session for every high-experience player
# that has at least one non-empty session.
top_mean_duration = []
for holder in sequences_top:
    totals = [np.sum(played.match_duration) for played in holder.sequences if len(played.index) > 0]
    if len(totals) > 0:
        top_mean_duration.append(np.mean(totals))

# Same statistic for the low-experience players.
worst_mean_duration = []
for holder in sequences_worst:
    totals = [np.sum(played.match_duration) for played in holder.sequences if len(played.index) > 0]
    if len(totals) > 0:
        worst_mean_duration.append(np.mean(totals))
# boxplot
from scipy.stats import wilcoxon
from scipy.stats import mannwhitneyu
# The verdict printed below is about a difference in either direction, so
# the two-sided alternative is requested explicitly; the default of
# mannwhitneyu has not been the same in every SciPy release.
test = mannwhitneyu(worst_mean_duration, top_mean_duration, alternative='two-sided')
print(test)
if test.pvalue > 0.05:
    print('No significant difference :(')
else:
    print('Significant difference detected :)')
fig = plt.figure(figsize = (8,5.3))
ax = fig.add_subplot(111)
bp = ax.boxplot([worst_mean_duration, top_mean_duration], patch_artist=True, widths = .75)
ax.set_xticklabels(['Low Experience\nPlayers', 'High Experience\nPlayers'], fontsize = 18)
ax.set_ylabel('Average Session Duration', fontsize = font_size)
ax.set_yscale('log')
for box in bp['boxes']:
    # outline first: `color` sets edge and face together ...
    box.set( color='black', linewidth=2)
    # ... then the fill colour overrides the face
    box.set( facecolor = 'white', alpha = .75 )
## change color and linewidth of the whiskers
for whisker in bp['whiskers']:
    whisker.set(color='#333333', linewidth=2)
## change color and linewidth of the caps
for cap in bp['caps']:
    cap.set(color='#333333', linewidth=2)
## change color and linewidth of the medians
for median in bp['medians']:
    median.set(color='#fc4f30', linewidth=2)
## change the style of fliers and their fill
for flier in bp['fliers']:
    flier.set(marker='s', markerfacecolor='#fc4f30')
fig.tight_layout()
#plt.savefig('average_duration_all.png',dpi=300)
plt.show()