文章目录

  • Solution
  • 总结

这是前段时间参加的天池比赛,最终成绩:18/3275。 下面先给出我的solution,最后再写一点总结。

Solution


# coding: utf-8# In[1]:import numpy as np
import pandas as pd
import lightgbm as lgb
import math
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import os
import random
# Fix all RNG seeds so repeated runs are reproducible.
np.random.seed(78)
random.seed(78)

# In[2]:
# Global accumulator: read_information() appends one flat row of features
# per ship file; the list is later reshaped into a DataFrame.
features = []
def dis_lat_lon(lat1, lon1, lat2, lon2):
    """Haversine great-circle distance (km) between two lat/lon points."""
    R = 6373.0  # approximate Earth radius in km
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance


def produce_feature(df, train):
    """Count/speed statistics of one speed segment `df` inside a time window `train`.

    Returns (count, mean speed, count ratio, speed std, speed 75th percentile).
    """
    nums = len(df)
    mean = np.mean(df['速度'])
    # Guard against an empty window to avoid ZeroDivisionError.
    ratio = len(df) / len(train) if len(train) != 0 else 0
    std = np.std(df['速度'])
    v_ = df['速度'].quantile(0.75)
    return nums, mean, ratio, std, v_


def angle(a, b, c):
    """Turning angle in degrees (int) at point b, formed by consecutive points a-b-c."""
    ab = [aa - bb for aa, bb in zip(a, b)]
    bc = [bb - cc for cc, bb in zip(c, b)]
    nab = np.sqrt(sum(x ** 2.0 for x in ab))
    ab = [x / nab for x in ab]
    nbc = np.sqrt(sum(x ** 2.0 for x in bc))
    bc = [x / nbc for x in bc]
    scal = sum(aa * bb for aa, bb in zip(ab, bc))
    # Clamp floating-point noise so acos stays inside its domain.
    if scal > 1:
        scal = 1
    elif scal < -1:
        scal = -1
    angle = int(math.acos(scal) * 180 / math.pi)
    angle = 180 - angle
    return angle


def produce_feature_v_xy(df):
    """Coordinate statistics for one time-of-day slice of a trajectory.

    Returns a 12-tuple:
    (k_min, k_max, x_min, x_max, y_min, y_max,
     x_max - y_min, y_max - x_min, x 25th pct, y 75th pct, cov(x, y), mean |v|).
    NOTE(review): writes back into `df['time']`; callers pass throwaway slices.
    """
    k = df['y'] / df['x']  # slope of each point relative to the origin
    k_min = k.min()
    k_max = k.max()
    x_min_ = df['x'].min()
    x_max_ = df['x'].max()
    y_min_ = df['y'].min()
    y_max_ = df['y'].max()
    x_max_y_min_ = df['x'].max() - df['y'].min()
    y_max_x_min_ = df['y'].max() - df['x'].min()
    x_25_ = df['x'].quantile(0.25)
    y_75_ = df['y'].quantile(0.75)
    if len(df) <= 1:
        xy_cov_ = 0  # covariance is undefined for a single point
    else:
        xy_cov_ = df['x'].cov(df['y'])
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    t_diff = df['time'].diff().iloc[1:].dt.total_seconds()
    x_diff = df['x'].diff().iloc[1:].abs()
    y_diff = df['y'].diff().iloc[1:].abs()
    x_a_mean = (x_diff / t_diff).mean()
    y_a_mean = (y_diff / t_diff).mean()
    xy_a_ = np.sqrt(x_a_mean ** 2 + y_a_mean ** 2)  # magnitude of mean planar speed
    return k_min, k_max, x_min_, x_max_, y_min_, y_max_, x_max_y_min_, y_max_x_min_, x_25_, y_75_, xy_cov_, xy_a_


def produce_feature_ang_ext(df):
    """Angle / distance features for one time-of-day slice.

    Returns an 11-tuple of angular-speed and turning-angle statistics plus the
    start-to-end distance; all zeros when the slice has fewer than two fixes.
    """
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df = df.sort_values(by='time')
    df['hour'] = df['time'].dt.hour
    df_tortuosity = df[['x', 'y', '方向', '速度', 'hour']].values.tolist()
    if len(df_tortuosity) > 1:
        ang_list = [0]
        dis_list = [0]
        for i in range(1, len(df_tortuosity) - 1):
            a = [df_tortuosity[i - 1][0], df_tortuosity[i - 1][1]]
            b = [df_tortuosity[i][0], df_tortuosity[i][1]]
            c = [df_tortuosity[i + 1][0], df_tortuosity[i + 1][1]]
            dis = dis_lat_lon(a[0], a[1], b[0], b[1])
            dis_list.append(dis)
            if a == b or b == c or a == c:
                # Repeated points make a degenerate segment; call the turn 0.
                ang_list.append(0)
            else:
                res = angle(a, b, c)
                ang_list.append(int(res))
        last_dis = dis_lat_lon(df_tortuosity[-1][0], df_tortuosity[-1][1], df_tortuosity[-2][0], df_tortuosity[-2][1])
        dis_list.append(last_dis)
        ang_list.append(int(ang_list[-1]))  # pad: repeat the previous angle for the last fix
        num_ang_all = len(ang_list)
        num_ang_0_100 = len([x for x in ang_list if x <= 100])
        ratio_ang_0_100 = num_ang_0_100 / num_ang_all
        num_ang_10_150 = len([x for x in ang_list if x > 10 and x < 150])
        ratio_ang_10_150 = num_ang_10_150 / num_ang_all
        num_ang_100_165 = len([x for x in ang_list if x > 100 and x < 165])
        ratio_ang_100_165 = num_ang_100_165 / num_ang_all
        df['est_d'] = ang_list
        df['est_dis'] = dis_list
        t_diff = df['time'].diff().iloc[1:].dt.total_seconds()
        t = [0]
        t.extend(t_diff.values.tolist())
        df['est_t'] = [x / 3600 for x in t]  # hours between fixes (0 for the first fix)
        # NOTE(review): est_v divides the angle column (est_d), not est_dis —
        # i.e. angular velocity, matching read_information's est_v_d; confirm intended.
        df['est_v'] = df['est_d'] / df['est_t']
        beg_end = dis_lat_lon(df_tortuosity[0][0], df_tortuosity[0][1], df_tortuosity[-1][0], df_tortuosity[-1][1])
    elif len(df_tortuosity) == 1:
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    else:
        return 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    return df['est_v'].mean(), df['est_v'].std(), df['est_v'].quantile(0.75), df['est_d'].mean(), num_ang_0_100, ratio_ang_0_100, num_ang_10_150, ratio_ang_10_150, num_ang_100_165, ratio_ang_100_165, beg_end


def x_y_area_count(df, all_df):
    """Count and speed/direction statistics of `df` relative to the full frame `all_df`."""
    num_all = len(all_df)
    num_ = len(df)
    num_ratio_ = num_ / num_all
    v_mean_c_ = df['速度'].mean()
    v_std_c_ = df['速度'].std()
    d_mean_c_ = df['方向'].mean()
    return [num_, num_ratio_, v_mean_c_, v_std_c_, d_mean_c_]


def read_information(path, know_type=True):
    """Read one ship's CSV and append its fixed-length feature row to the global
    `features` list (plus an integer label when `know_type` is True)."""
    df = pd.read_csv(path)
    if know_type:
        df.columns = ['ship', 'x', 'y', '速度', '方向',  'time', 'type']
    else:
        df.columns = ['ship', 'x', 'y', '速度', '方向', 'time']
    # --- angle / distance features over the whole trajectory ---
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df = df.sort_values(by='time')
    df['hour'] = df['time'].dt.hour
    df_tortuosity = df[['x', 'y', '速度', '方向', 'hour']].values.tolist()
    ang_list = [0]
    dis_list = [0]
    for i in range(1, len(df_tortuosity) - 1):
        a = [df_tortuosity[i - 1][0], df_tortuosity[i - 1][1]]
        b = [df_tortuosity[i][0], df_tortuosity[i][1]]
        c = [df_tortuosity[i + 1][0], df_tortuosity[i + 1][1]]
        dis = dis_lat_lon(a[0], a[1], b[0], b[1])
        dis_list.append(dis)
        if a == b or b == c or a == c:
            ang_list.append(0)
        else:
            res = angle(a, b, c)
            ang_list.append(int(res))
    # NOTE(review): with a single-row file this [-2] index would raise
    # IndexError; the competition data presumably always has >= 2 fixes.
    dis_list.append(dis_lat_lon(df_tortuosity[-1][0], df_tortuosity[-1][1], df_tortuosity[-2][0], df_tortuosity[-2][1]))
    ang_list.append(int(ang_list[-1]))
    num_ang_all = len(ang_list)
    num_ang_0_100 = len([x for x in ang_list if x <= 100])
    ratio_ang_0_100 = num_ang_0_100 / num_ang_all
    num_ang_10_150 = len([x for x in ang_list if x > 10 and x < 150])
    ratio_ang_10_150 = num_ang_10_150 / num_ang_all
    num_ang_100_165 = len([x for x in ang_list if x > 100 and x < 165])
    ratio_ang_100_165 = num_ang_100_165 / num_ang_all
    df['est_d'] = ang_list
    df['d_diff'] = df['est_d'] - df['方向']
    df['est_dis'] = dis_list
    t_diff = df['time'].diff().iloc[1:].dt.total_seconds()
    t = [0]
    t.extend(t_diff.values.tolist())
    df['est_t'] = [x / 3600 for x in t]
    df['est_v_dis'] = df['est_dis'] / df['est_t']  # this is the estimated speed (est_v)
    df['est_v_d'] = df['est_d'] / df['est_t']  # this is the angular velocity
    df['v_diff'] = df['速度'] - df['est_v_dis']
    beg_end = dis_lat_lon(df_tortuosity[0][0], df_tortuosity[0][1], df_tortuosity[-1][0], df_tortuosity[-1][1])
    # NOTE(review): int() on unique() assumes exactly one ship id per file.
    features.append(int(df['ship'].unique()))
    features.extend([
        df['est_v_d'].mean(), df['est_v_d'].std(), df['est_v_d'].quantile(0.75),
        df['v_diff'].mean(),
        df['d_diff'].mean(), df['d_diff'].max(), df['d_diff'].min(),
        df['est_d'].mean(),
        num_ang_0_100, ratio_ang_0_100,
        num_ang_10_150, ratio_ang_10_150,
        num_ang_100_165, ratio_ang_100_165,
        beg_end,
    ])
    # --- time-of-day slices: two night windows and three day windows ---
    night1 = df[19 <= df['hour']]
    night1 = night1[night1['hour'] < 23]
    night2_1 = df[23 <= df['hour']]
    night2_2 = df[df['hour'] <= 3]
    night2 = pd.concat([night2_1, night2_2], axis=0)
    night = pd.concat([night1, night2_1, night2_2], axis=0)
    day1 = df[3 < df['hour']]
    day1 = day1[day1['hour'] < 10]
    day2 = df[10 <= df['hour']]
    day2 = day2[day2['hour'] < 16]
    day3 = df[16 <= df['hour']]
    day3 = day3[day3['hour'] < 19]
    day = pd.concat([day1, day2, day3], axis=0)
    # Coordinate statistics for each of the five slices, then night/day overall
    # (the order of this tuple fixes the column order downstream).
    for seg in (night1, night2, day1, day2, day3, night, day):
        features.extend(produce_feature_v_xy(seg))
    # Finer angle features, split only by night vs. day.
    for seg in (night, day):
        features.extend(produce_feature_ang_ext(seg))
    # --- whole-trajectory statistics ---
    features.extend([
        df['x'].min(), df['x'].max(), df['x'].mean(), df['x'].quantile(0.25),
        df['y'].min(), df['y'].max(), df['y'].mean(), df['y'].quantile(0.75),
        df['x'].cov(df['y']),
    ])
    df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    t_diff = df['time'].diff().iloc[1:].dt.total_seconds()
    x_diff = df['x'].diff().iloc[1:].abs()
    y_diff = df['y'].diff().iloc[1:].abs()
    dis = sum(np.sqrt(x_diff ** 2 + y_diff ** 2))  # total path length (computed but unused)
    x_a_mean = (x_diff / t_diff).mean()
    y_a_mean = (y_diff / t_diff).mean()
    features.append(np.sqrt(x_a_mean ** 2 + y_a_mean ** 2))
    features.extend([
        df['速度'].mean(), df['速度'].std(), df['速度'].quantile(0.75),
        df['方向'].mean(),
    ])
    # Integer label, appended only for training files: 拖网=2, 刺网=1, 围网=0.
    if know_type:
        if df["type"].iloc[0] == '拖网':
            features.append(2)
        if df["type"].iloc[0] == '刺网':
            features.append(1)
        if df["type"].iloc[0] == '围网':
            features.append(0)


# In[3]:
train_path = r"hy_round2_train_20200225/"
# Read every training file; each read_information() call appends one
# fixed-length row of features to the global `features` list.
train_files = os.listdir(train_path)
train_files = list(np.sort(train_files))  # sorted for a deterministic row order
length_tr = len(train_files)
for files in train_files:
    path = train_path + str(files)
    read_information(path, know_type=True)

# Reshape the flat accumulator into one row per ship and name the columns
# (order must mirror the append order inside read_information).
train_data = pd.DataFrame(np.array(features).reshape(length_tr, int(len(features) / length_tr)))
train_data.columns = [
    'ship', 'est_v_m', 'est_v_s', 'est_v_75', 'v_diff_mean', 'd_diff_mean',
    'd_diff_max', 'd_diff_min', 'est_d', 'num_ang_0_100', 'ratio_ang_0_100',
    'num_ang_10_150', 'ratio_ang_10_150', 'num_ang_100_165', 'ratio_ang_100_165', 'beg_end',
    'k_min_1', 'k_max_1', 'x_min_n_1', 'x_max_n_1', 'y_min_n_1', 'y_max_n_1',
    'x_max_y_min_n_1', 'y_max_x_min_n_1', 'x_25_n_1', 'y_75_n_1', 'xy_cov_n_1', 'xy_a_n_1',
    'k_min_2', 'k_max_2', 'x_min_n_2', 'x_max_n_2', 'y_min_n_2', 'y_max_n_2',
    'x_max_y_min_n_2', 'y_max_x_min_n_2', 'x_25_n_2', 'y_75_n_2', 'xy_cov_n_2', 'xy_a_n_2',
    'k_min_3', 'k_max_3', 'x_min_d_1', 'x_max_d_1', 'y_min_d_1', 'y_max_d_1',
    'x_max_y_min_d_1', 'y_max_x_min_d_1', 'x_25_d_1', 'y_75_d_1', 'xy_cov_d_1', 'xy_a_d_1',
    'k_min_4', 'k_max_4', 'x_min_d_2', 'x_max_d_2', 'y_min_d_2', 'y_max_d_2',
    'x_max_y_min_d_2', 'y_max_x_min_d_2', 'x_25_d_2', 'y_75_d_2', 'xy_cov_d_2', 'xy_a_d_2',
    'k_min_5', 'k_max_5', 'x_min_d_3', 'x_max_d_3', 'y_min_d_3', 'y_max_d_3',
    'x_max_y_min_d_3', 'y_max_x_min_d_3', 'x_25_d_3', 'y_75_d_3', 'xy_cov_d_3', 'xy_a_d_3',
    'k_min_n', 'k_max_n', 'x_min_n_', 'x_max_n_', 'y_min_n_', 'y_max_n_',
    'x_max_y_min_n_', 'y_max_x_min_n_', 'x_25_n_', 'y_75_n_', 'xy_cov_n_', 'xy_a_n_',
    'k_min_d', 'k_max_d', 'x_min_d_', 'x_max_d_', 'y_min_d_', 'y_max_d_',
    'x_max_y_min_d_', 'y_max_x_min_d_', 'x_25_d_', 'y_75_d_', 'xy_cov_d_', 'xy_a_d_',
    'est_v_m_1', 'est_v_s_1', 'est_v_75_1', 'est_d_1', 'num_ang_0_100_1', 'ratio_ang_0_100_1',
    'num_ang_10_150_1', 'ratio_ang_10_150_1', 'num_ang_100_165_1', 'ratio_ang_100_165_1', 'beg_end_1',
    'est_v_m_2', 'est_v_s_2', 'est_v_75_2', 'est_d_2', 'num_ang_0_100_2', 'ratio_ang_0_100_2',
    'num_ang_10_150_2', 'ratio_ang_10_150_2', 'num_ang_100_165_2', 'ratio_ang_100_165_2', 'beg_end_2',
    'x_min', 'x_max', 'x_mean', 'x_1/4', 'y_min', 'y_max', 'y_mean', 'y_3/4', 'xy_cov', 'a',
    'v_mean', 'v_std', 'v_3/4', 'd_mean', 'type']
train_data.fillna(0, inplace=True)

# In[5]:
features = []  # reset the accumulator before the test files are processed
# Same pipeline for the test files (no label column: know_type=False).
test_path = r'hy_round2_testA_20200225/'
test_files = os.listdir(test_path)
test_files = list(np.sort(test_files))  # sorted for a deterministic row order
length_te = len(test_files)
for files in test_files:
    path = test_path + str(files)
    read_information(path, know_type=False)

test_data = pd.DataFrame(np.array(features).reshape(length_te, int(len(features) / length_te)))
test_data.columns = [
    'ship', 'est_v_m', 'est_v_s', 'est_v_75', 'v_diff_mean', 'd_diff_mean',
    'd_diff_max', 'd_diff_min', 'est_d', 'num_ang_0_100', 'ratio_ang_0_100',
    'num_ang_10_150', 'ratio_ang_10_150', 'num_ang_100_165', 'ratio_ang_100_165', 'beg_end',
    'k_min_1', 'k_max_1', 'x_min_n_1', 'x_max_n_1', 'y_min_n_1', 'y_max_n_1',
    'x_max_y_min_n_1', 'y_max_x_min_n_1', 'x_25_n_1', 'y_75_n_1', 'xy_cov_n_1', 'xy_a_n_1',
    'k_min_2', 'k_max_2', 'x_min_n_2', 'x_max_n_2', 'y_min_n_2', 'y_max_n_2',
    'x_max_y_min_n_2', 'y_max_x_min_n_2', 'x_25_n_2', 'y_75_n_2', 'xy_cov_n_2', 'xy_a_n_2',
    'k_min_3', 'k_max_3', 'x_min_d_1', 'x_max_d_1', 'y_min_d_1', 'y_max_d_1',
    'x_max_y_min_d_1', 'y_max_x_min_d_1', 'x_25_d_1', 'y_75_d_1', 'xy_cov_d_1', 'xy_a_d_1',
    'k_min_4', 'k_max_4', 'x_min_d_2', 'x_max_d_2', 'y_min_d_2', 'y_max_d_2',
    'x_max_y_min_d_2', 'y_max_x_min_d_2', 'x_25_d_2', 'y_75_d_2', 'xy_cov_d_2', 'xy_a_d_2',
    'k_min_5', 'k_max_5', 'x_min_d_3', 'x_max_d_3', 'y_min_d_3', 'y_max_d_3',
    'x_max_y_min_d_3', 'y_max_x_min_d_3', 'x_25_d_3', 'y_75_d_3', 'xy_cov_d_3', 'xy_a_d_3',
    'k_min_n', 'k_max_n', 'x_min_n_', 'x_max_n_', 'y_min_n_', 'y_max_n_',
    'x_max_y_min_n_', 'y_max_x_min_n_', 'x_25_n_', 'y_75_n_', 'xy_cov_n_', 'xy_a_n_',
    'k_min_d', 'k_max_d', 'x_min_d_', 'x_max_d_', 'y_min_d_', 'y_max_d_',
    'x_max_y_min_d_', 'y_max_x_min_d_', 'x_25_d_', 'y_75_d_', 'xy_cov_d_', 'xy_a_d_',
    'est_v_m_1', 'est_v_s_1', 'est_v_75_1', 'est_d_1', 'num_ang_0_100_1', 'ratio_ang_0_100_1',
    'num_ang_10_150_1', 'ratio_ang_10_150_1', 'num_ang_100_165_1', 'ratio_ang_100_165_1', 'beg_end_1',
    'est_v_m_2', 'est_v_s_2', 'est_v_75_2', 'est_d_2', 'num_ang_0_100_2', 'ratio_ang_0_100_2',
    'num_ang_10_150_2', 'ratio_ang_10_150_2', 'num_ang_100_165_2', 'ratio_ang_100_165_2', 'beg_end_2',
    'x_min', 'x_max', 'x_mean', 'x_1/4', 'y_min', 'y_max', 'y_mean', 'y_3/4', 'xy_cov', 'a',
    'v_mean', 'v_std', 'v_3/4', 'd_mean']
test_data.fillna(0, inplace=True)

# In[4]:
kind = train_data.type  # training labels: 拖网=2, 刺网=1, 围网=0
# Separate the label from the feature matrix.
train_data = train_data.drop('type', axis=1)
features = [x for x in train_data.columns]
train_data = train_data[features]

# In[5]:
# Quick 90/10 hold-out sanity check.
x_train, x_test, y_train, y_test = train_test_split(train_data, kind, test_size=0.1, random_state=78)

# In[6]:
params = {
    'learning_rate': 0.2036,
    'max_depth': 6,
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'n_estimators': 5561,
    'num_class': 3,
    'feature_fraction': .5242,
    'bagging_fraction': .3624,
    'class_weight': {0: 3, 1: 5, 2: 2.5},  # rebalance the three gear classes
    'seed': 78,
}
llf = lgb.LGBMClassifier(**params)
llf.fit(x_train, y_train)
weight_lgb = f1_score(y_test, llf.predict(x_test), average='macro')

# 20-fold stratified CV with the same model configuration.
details = []
answers = []
scores = []
sk = StratifiedKFold(n_splits=20, shuffle=True, random_state=2020)
for train, test in sk.split(train_data, kind):
    x_train = train_data.iloc[train]
    y_train = kind.iloc[train]
    x_test = train_data.iloc[test]
    y_test = kind.iloc[test]
    llf.fit(x_train, y_train)
    pred_llf = llf.predict(x_test)
    weight_lgb = f1_score(y_test, pred_llf, average='macro')
    prob_lgb = llf.predict_proba(x_test)
    prob_end = prob_lgb
    score = f1_score(y_test, np.argmax(prob_end, axis=1), average='macro')
    scores.append(score)
    details.append(score)
    details.append(weight_lgb)
    print('score: ', score)
print(np.mean(details))

# In[7]:
# Hyperparameters below were tuned with Bayesian optimisation.
from sklearn import metrics

params = {
    'learning_rate': 0.2036,
    'max_depth': 6,
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'n_estimators': 5561,
    'num_class': 3,
    'feature_fraction': .5242,
    'bagging_fraction': .3624,
    'early_stopping_rounds': 100,
}
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X = train_data.copy()
y = kind
models = []
oof = np.zeros((len(X), 3))  # out-of-fold class probabilities
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):
    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])
    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100)
    models.append(model)
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred
    val_y = y.iloc[val_idx]
    val_pred = np.argmax(val_pred, axis=1)
    print(index, 'val f1', metrics.f1_score(val_y, val_pred, average='macro'))
    # 0.8695539641133697
    # 0.8866211724839532
oof = np.argmax(oof, axis=1)
# NOTE(review): arguments are (pred, true) rather than the conventional
# (y_true, y_pred); macro-F1 is symmetric in its inputs, so the value is unchanged.
print('oof f1', metrics.f1_score(oof, y, average='macro'))
# 0.8701544575329372

# In[8]:
x_train, x_test, y_train, y_test = train_test_split(train_data, kind, test_size=0.1, random_state=78)
# Final fit on the 90% split and macro-F1 on the 10% hold-out.
params = {
    'learning_rate': 0.2036,
    'max_depth': 6,
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'n_estimators': 5561,
    'num_class': 3,
    'feature_fraction': .5242,
    'bagging_fraction': .3624,
    'class_weight': {0: 3, 1: 5, 2: 2.5},
    'seed': 78,
}
llf = lgb.LGBMClassifier(**params)
llf.fit(x_train, y_train)
weight_lgb = f1_score(y_test, llf.predict(x_test), average='macro')
print(weight_lgb)

总结

比赛理解:各种合理有效交叉特征可以批量扩展,结合业务背景往往比较精准。造特征要批量进行(别一个个,别试图学习test),但要注意,随时考虑过拟合(即考虑线下线上,ab榜数据可能区别,避免造只适合train的特征),并采用合理的验证方式。

比赛总结:

  1. 探索数据可视化与清洗(极大过拟合,利用了label清洗(作弊),没有考虑线上线下要相同处理问题)。
  2. 探索利用数据的方式,直接用上全部数据(过多噪音,过拟合严重)
  3. 开始进行正规,造特征,但是未找到有效造特征方式
  4. 在依据(论文,咨询渔民)基础上,有依据地造特征
  5. 尝试了一些工作:
    • 模型融合(xgb、cat、lgb)一点点效果
    • 特征选择—根据feature重要性选择—无效,RFE选择—无效
    • 数据清洗—有效(删除设备故障导致的数据)
    • 贝叶斯优化—提高6-7个千分点(kaggle上薅来的代码)
    • 样本不平衡处理:lgb,xgb参数—有效(调整权重);smote等方式—无效
  6. 造特征工作:
    • 引入一些统计性特征:有效
    • 按时间段划分特征:有效 (不同类特征,按照不同时间段划分)
    • 拐角类特征:有效
    • 魔法组合特征:有效
    • 按日夜细分拐角类,按五个时间段细分坐标类有效(但其它划分无效)
    • 调节阈值(无效)
    • 引入意义接近但不够精准的特征,无效
    • 距离类特征:无效
  7. 需要改进工作:
    • 线下线上不一致,需要可靠验证方式(最重要点之一)
    • 融合需要nn等差异大模型
    • 特征选择
    • ab榜数据区别没考虑,导致造的绝对特征过拟合
    • 特征选择未有特别有效的方法
  8. 其他队伍有效方法:
    • 预判ab榜差异(造相对特征而不是绝对特征)
    • 造地图匹配特征
    • 海岸线距离特征
    • 两层验证更有效
    • 使用auc指标考察线上线下gap
    • word2vec
    • 使用nn融合
    • 批量造特征,防过拟合
    • embedding技术

队友总结:

  1. 8729: 对原始数据(x, y, v, d, t)进行统计(‘max’,‘min’,‘mean’,‘std’,‘skew’,‘sum’),lgb, (另加xgb+lgb+cgb)
  2. 全信息:xy+hour+weekend 到 lgb
  3. 8729 + xy_by_t(细分、总分)+ v_by_t + xy_by_v + 最大最小xy的面积
  4. 加入轨迹特征(拐角、两点距离、est_v),不同阈值的划分和统计;和在t上的划分
  5. 经纬度统计,v_diff, d_diff
  6. 魔法特征
  7. 特征选择: RFE

2020数字中国 天池——智慧海洋建设 Top1%相关推荐

  1. 【天池智慧海洋建设】Topline源码——特征工程学习(总结)

    [天池智慧海洋建设]Topline源码--特征工程学习 目录 [天池智慧海洋建设]Topline源码--特征工程学习 前言 学习来源 部分解释 学习目标 内容介绍 I 特征工程概述 I 数据部分 II ...

  2. 【天池智慧海洋建设】Topline源码——特征工程学习(大白)

    [天池智慧海洋建设]Topline源码--特征工程学习 团队名称:大白 链接: https://github.com/Ai-Light/2020-zhihuihaiyang 目录 [天池智慧海洋建设] ...

  3. 【天池智慧海洋建设】Topline源码——特征工程学习(天才海神号)

    [天池智慧海洋建设]Topline源码--特征工程学习 团队名称:天才海神号 链接: https://github.com/fengdu78/tianchi_haiyang?spm=5176.1228 ...

  4. 【竞赛】智能算法赛:智慧海洋建设Top1方案代码

    海上安全治理是海洋发展中至关重要的环节,了解各个区域船只的工作情况以及具体位置,可以对于防止因为船只的碰撞等事故而造成的巨大损失,而要提升海上安全治理能力,首要任务是"看得清",即 ...

  5. 【比赛实战篇】智能算法赛:智慧海洋建设Top1方案代码

    海上安全治理是海洋发展中至关重要的环节,了解各个区域船只的工作情况以及具体位置,可以对于防止因为船只的碰撞等事故而造成的巨大损失,而要提升海上安全治理能力,首要任务是"看得清",即 ...

  6. 智慧海洋建设TOP方案借鉴学习与整理

    文章目录 数据探索与预处理 渔船作业方式的定义 渔船作业过程中的三种状态 预处理 特征工程 统计特征 表征渔船的轨迹 POI信息 基于轨迹序列绝对和相对位置的复合向量编码 表征渔船不同状态下的信息 A ...

  7. 智慧海洋建设TOP_1方案学习笔记

    文章目录 赛题 要求 数据 TOP1 解决方案 数据读取 特征工程 基本预处理 统计特征 基于轨迹序列绝对和相对位置的复合向量编码 Word2Vec 比赛来源:天池平台 智慧海洋建设 感谢大佬开源TO ...

  8. 【算法实战篇】时序多分类赛题-2020数字中国创新大赛-智慧海洋建设top5方案(含源码)

        Hi,大家好!这里是AILIGHT!AI light the world!这次给大家带来的是2020数字中国创新大赛-数字政府赛道-智能算法赛:智慧海洋建设的算法赛复赛赛道B top5的方案以 ...

  9. 【时序多分类赛题】2020数字中国创新大赛-智慧海洋建设top5方案(含源码)

       这次给大家带来的是2020数字中国创新大赛-数字政府赛道-智能算法赛:智慧海洋建设的算法赛复赛赛道B top5的方案以及代码开源.比赛传送门:https://tianchi.aliyun.com ...

最新文章

  1. 网页中如何获取客户端系统已安装的所有字体?
  2. 智能安防“无芯片不AI”时代来临
  3. linux docker安装mysql_Linux-docker安装mysql
  4. Android的EditText自动获取焦点并弹出输入法问题
  5. java-1.11.0的环境配置,JAVA 环境配置
  6. 汇编:内存地址为什么从0开始?等问题
  7. 并发基础(七):Thread 类的sleep()、yeild()、join()
  8. 人工智能学习--文本检测实践
  9. 【Linux】一步一步学Linux——mktemp命令(263)
  10. Just $h$-index HDU - 6278(主席树找区间大于等于k的个数)
  11. 移动端 长按事件_Flutter事件监听
  12. 三、MySQL子查询学习笔记(标量子查询、列子查询、行子查询、表子查询 详解)
  13. 拆分SharePoint 2013 中CreatedModifiedInfo 的时间
  14. jenkins 管理员账号丢失
  15. 路飞学城Python-Day13
  16. Xcode8 最快最方便的安装插件方案
  17. 2020寒假【gmoj2008】【Oliver的成绩】【高精度】
  18. 经典:基因组测序数据从头拼接或组装算法的原理
  19. VMware虚拟机Ubuntu自适应屏幕大小
  20. 看着很滑稽,但现实又何尝不是这样呢?

热门文章

  1. 门窗软件测试自学,Revit 系列讲座第一季-路文虎
  2. Bye-- Czech golden generation
  3. cdo收取邮件_使用 CDO 发送测试电子邮件消息
  4. [QQ机器人]Nonebot2 今日运势插件
  5. Tensorflow + PyTorch 安装(CPU + GPU 版本)
  6. canvas 制作动画(下)
  7. GD32使用CRC的操作方法
  8. Javase基础(二)——数据类型
  9. 华为汽车战线的“多面手”
  10. C语言练习题,判断二维空间中的点,是否在圆内(输出:该点在圆内、该点在圆上、该点在圆外)