
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Kobe Bryant career shot data.

# Lift pandas' display limits so wide frames are printed in full.
# The obsolete 'display.height' option is deliberately not set: recent pandas
# releases no longer register it and raise OptionError when it is used.
pd.set_option('display.max_rows', 9999)
pd.set_option('display.max_columns', 9999)
pd.set_option('display.width', 9999)

# Load the complete shot log from the working directory.
raw = pd.read_csv("data.csv")
# print(raw.shape)  # (30697, 25)
# print(raw.head())
# Sample output of raw.head():
# action_type combined_shot_type  game_event_id   game_id      lat  loc_x  loc_y       lon  minutes_remaining  period  playoffs   season  seconds_remaining  shot_distance  shot_made_flag       shot_type         shot_zone_area  shot_zone_basic  shot_zone_range     team_id           team_name   game_date    matchup opponent  shot_id
# 0          Jump Shot          Jump Shot             10  20000012  33.9723    167     72 -118.1028                 10       1         0  2000-01                 27             18             NaN  2PT Field Goal          Right Side(R)        Mid-Range        16-24 ft.  1610612747  Los Angeles Lakers  2000-10-31  LAL @ POR      POR        1
# 1          Jump Shot          Jump Shot             12  20000012  34.0443   -157      0 -118.4268                 10       1         0  2000-01                 22             15             0.0  2PT Field Goal           Left Side(L)        Mid-Range         8-16 ft.  1610612747  Los Angeles Lakers  2000-10-31  LAL @ POR      POR        2
# 2          Jump Shot          Jump Shot             35  20000012  33.9093   -101    135 -118.3708                  7       1         0  2000-01                 45             16             1.0  2PT Field Goal   Left Side Center(LC)        Mid-Range        16-24 ft.  1610612747  Los Angeles Lakers  2000-10-31  LAL @ POR      POR        3
# 3          Jump Shot          Jump Shot             43  20000012  33.8693    138    175 -118.1318                  6       1         0  2000-01                 52             22             0.0  2PT Field Goal  Right Side Center(RC)        Mid-Range        16-24 ft.  1610612747  Los Angeles Lakers  2000-10-31  LAL @ POR      POR        4
# 4  Driving Dunk Shot               Dunk            155  20000012  34.0443      0      0 -118.2698                  6       2         0  2000-01                 19              0             1.0  2PT Field Goal              Center(C)  Restricted Area  Less Than 8 ft.  1610612747  Los Angeles Lakers  2000-10-31  LAL @ POR      POR        5

# shot_made_flag: whether the shot went in.
# Keep only the rows whose outcome is known (the flag is NaN otherwise).
kobe = raw[raw["shot_made_flag"].notnull()]
# print(kobe.shape)  # (25697, 25)

# NOTE(review): the only plotting calls that would draw on this figure are the
# disabled scatter plots right below, so it currently stays empty.
plt.figure(figsize=(10, 10))

# alpha = 0.02  # transparency
# plt.subplot(121)
# plt.scatter(kobe["loc_x"], kobe["loc_y"], color="R", alpha=alpha)  # coordinates on the court
# plt.title("loc_x and loc_y")
# plt.subplot(122)
# plt.scatter(kobe["lon"], kobe["lat"], color="B", alpha=alpha)  # longitude / latitude
# plt.title("lon and lat")
# plt.show()

# Polar coordinates: distance to the origin plus the angle with the x-axis.
# raw['dist'] = np.sqrt(raw['loc_x'] ** 2 + raw['loc_y'] ** 2)
# loc_x_zero = raw['loc_x'] == 0
# raw['angle'] = np.array([0] * len(raw))
# raw['angle'][~loc_x_zero] = np.arctan(raw['loc_y'][~loc_x_zero] / raw['loc_x'][~loc_x_zero])
# raw['angle'][loc_x_zero] = np.pi / 2
# raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']

# print(kobe["action_type"].unique())
# print(kobe["combined_shot_type"].unique())
# print(kobe["shot_type"].unique())
# print(kobe["shot_type"].value_counts())

# season
# print(kobe['season'].unique())
# Reduce each season string to the integer after the dash ('2000-01' -> 1).
# Only `raw` is modified; `kobe` was derived earlier and keeps the strings.
raw['season'] = raw['season'].apply(lambda x: int(x.split("-")[1]))
# print(raw['season'].unique())
# print(kobe['team_id'].unique())
# print(kobe['team_name'].unique())

# Labelled shots grouped by court area (used by the disabled prints below).
gs = kobe.groupby("shot_zone_area")
# print(kobe["shot_zone_area"].value_counts())
# print(len(gs))

# print(kobe["shot_zone_area"].unique())
# ['Left Side(L)' 'Left Side Center(LC)' 'Right Side Center(RC)' 'Center(C)' 'Right Side(R)' 'Back Court(BC)']
# print(kobe["shot_zone_basic"].unique())
# ['Mid-Range' 'Restricted Area' 'In The Paint (Non-RA)' 'Above the Break 3' 'Right Corner 3' 'Backcourt' 'Left Corner 3']
# print(kobe["shot_zone_range"].unique())
# ['8-16 ft.' '16-24 ft.' 'Less Than 8 ft.' '24+ ft.' 'Back Court Shot']

import matplotlib.cm as cm

plt.figure(figsize=(20, 10))


def scatter_plot_by_category(feat):
    """Scatter the shots in ``kobe`` by court position, coloured by ``feat``.

    The module-level ``kobe`` frame is grouped by the column ``feat``; every
    group is drawn on the current pyplot axes at (loc_x, loc_y) with its own
    colour taken from the rainbow colormap.

    Args:
        feat: Name of the ``kobe`` column to group by.

    Returns:
        None. The points are added to the current axes as a side effect.
    """
    alpha = 0.1
    groups = kobe.groupby(feat)
    # One evenly spaced rainbow colour per group.
    colours = cm.rainbow(np.linspace(0, 1, len(groups)))
    for (_, shots), colour in zip(groups, colours):
        plt.scatter(shots["loc_x"], shots["loc_y"], color=colour, alpha=alpha)


# # shot_zone_area
# plt.subplot(131)
# scatter_plot_by_category('shot_zone_area')
# plt.title('shot_zone_area')
# # shot_zone_basic
# plt.subplot(132)
# scatter_plot_by_category('shot_zone_basic')
# plt.title('shot_zone_basic')
# # shot_zone_range
# plt.subplot(133)
# scatter_plot_by_category('shot_zone_range')
# plt.title('shot_zone_range')
# plt.show()

# Columns removed from `raw` before the remaining ones are encoded.
drops = [
    'shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range',
    'shot_zone_basic', 'matchup', 'lon', 'lat', 'seconds_remaining',
    'minutes_remaining', 'shot_distance', 'loc_x', 'loc_y', 'game_event_id',
    'game_id', 'game_date',
]
# Drop all of them in one call instead of one in-place drop per column.
raw.drop(drops, axis=1, inplace=True)
# print(raw.head())
# Sample output of raw.head() after the drop:
# action_type combined_shot_type  period  playoffs  season  shot_made_flag       shot_type opponent
# 0          Jump Shot          Jump Shot       1         0       1             NaN  2PT Field Goal      POR
# 1          Jump Shot          Jump Shot       1         0       1             0.0  2PT Field Goal      POR
# 2          Jump Shot          Jump Shot       1         0       1             1.0  2PT Field Goal      POR
# 3          Jump Shot          Jump Shot       1         0       1             0.0  2PT Field Goal      POR
# 4  Driving Dunk Shot               Dunk       2         0       1             1.0  2PT Field Goal      POR

# print(raw['combined_shot_type'].value_counts())
# dummies_cs_type = pd.get_dummies(raw['combined_shot_type'], prefix='cs_type')
# raw = pd.concat([raw, dummies_cs_type], axis=1)
# raw = raw.drop("combined_shot_type", axis=1)
# print(raw.head())

# One-hot encode every categorical column: append its indicator columns
# (prefixed with the column name) and then remove the original column.
categorical_vars = ['action_type', 'combined_shot_type', 'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    raw = pd.concat([raw, pd.get_dummies(raw[var], prefix=var)], axis=1)
    # `axis` must be passed by keyword; pandas 2.x rejects it positionally.
    raw = raw.drop(var, axis=1)
print(raw.head())

# Rows with a known shot_made_flag are used for training; the flag itself is
# split off as the label vector.
train_kobe = raw[raw['shot_made_flag'].notnull()]
train_label = train_kobe['shot_made_flag']
train_kobe = train_kobe.drop('shot_made_flag', axis=1)

# Rows whose flag is missing form the set to predict.
# `axis` must be passed by keyword; pandas 2.x rejects it positionally.
test_kobe = raw[raw['shot_made_flag'].isnull()]
test_kobe = test_kobe.drop('shot_made_flag', axis=1)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix, log_loss
import time
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# `sklearn.cross_validation` no longer exists; KFold lives in
# `sklearn.model_selection` and is driven through `.split()`.
from sklearn.model_selection import KFold

# Number of cross-validation folds used by both searches below.
N_FOLDS = 10


def _cv_log_loss(model, features, labels, n_folds=N_FOLDS):
    """Return the mean log loss of ``model`` over shuffled K-fold splits.

    Args:
        model: Classifier exposing ``fit``, ``predict_proba`` and ``classes_``.
            It is refitted in place on every fold.
        features: DataFrame of feature rows (indexed positionally via iloc).
        labels: Series of labels aligned with ``features``.
        n_folds: Number of folds.

    Returns:
        The log loss averaged over the folds. The splits are shuffled without
        a fixed seed, so the value differs between calls.
    """
    total = 0.0
    for train_k, test_k in KFold(n_splits=n_folds, shuffle=True).split(features):
        model.fit(features.iloc[train_k], labels.iloc[train_k])
        # Previous accuracy-based scoring, left disabled:
        # rfc_score += rfc.score(train.iloc[test_k], train_y.iloc[test_k])/10
        # log_loss is defined on probability estimates, so score the class
        # probabilities rather than the hard 0/1 predictions.
        proba = model.predict_proba(features.iloc[test_k])
        total += log_loss(labels.iloc[test_k], proba, labels=model.classes_)
    return total / n_folds


# find the best n_estimators for RandomForestClassifier
print('Finding best n_estimators for RandomForestClassifier...')
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0, 2, num=3).astype(int)  # 1, 10, 100
for n in range_n:  # number of trees
    print("the number of trees : {0}".format(n))
    t1 = time.time()
    rfc = RandomForestClassifier(n_estimators=n)
    rfc_score = _cv_log_loss(rfc, train_kobe, train_label)
    scores_n.append(rfc_score)
    # Lower log loss is better; remember the best tree count so far.
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n
    t2 = time.time()
    print('Done processing {0} trees ({1:.3f}sec)'.format(n, t2 - t1))
print(best_n, min_score)

# find best max_depth for RandomForestClassifier
print('Finding best max_depth for RandomForestClassifier...')
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0, 2, num=3).astype(int)  # 1, 10, 100
for m in range_m:  # depth of the trees
    print("the max depth : {0}".format(m))
    t1 = time.time()
    # Reuse the tree count selected by the search above.
    rfc = RandomForestClassifier(max_depth=m, n_estimators=best_n)
    rfc_score = _cv_log_loss(rfc, train_kobe, train_label)
    scores_m.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_m = m
    t2 = time.time()
    # This loop varies the depth, not the tree count, so report it as such.
    print('Done processing max depth {0} ({1:.3f}sec)'.format(m, t2 - t1))
print(best_m, min_score)
# Plot the cross-validation score of both searches side by side.
plt.figure(figsize=(10, 5))
# Put the first curve in the left panel explicitly; without this it is drawn
# on full-figure axes that the right-hand subplot below then overlaps.
plt.subplot(121)
plt.plot(range_n, scores_n)
plt.xlabel('number of trees')

plt.subplot(122)
plt.plot(range_m, scores_m)
plt.xlabel('max depth')
plt.show()

# Fit the final classifier on all labelled rows with the selected settings.
model = RandomForestClassifier(n_estimators=best_n, max_depth=best_m)
model.fit(train_kobe, train_label)


