import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe
from numpy.random import RandomState
from sklearn.metrics import mean_squared_error

Data Loading

train = pd.read_csv("preprocess/train.csv")
test = pd.read_csv("preprocess/test.csv")

Wrapper Feature Selection + LightGBM Modeling + TPE Tuning

  In this modeling round we use the hyperopt optimizer for hyperparameter search. hyperopt is a Bayesian-style optimizer that can search over both continuous and discrete variables; the search algorithms it currently supports include random search, simulated annealing, and TPE (Tree-structured Parzen Estimators). Compared with grid search, hyperopt is typically faster and reaches better solutions. Here, hp is the family of functions used to define the parameter space, fmin is the search driver, and tpe is the Bayesian search strategy we will use.
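  As a minimal, self-contained sketch of how these three pieces fit together (the toy quadratic objective and the search space below are purely illustrative and not part of the competition code):

from hyperopt import hp, fmin, tpe

# Toy objective: fmin minimizes the value returned here
def objective(params):
    x = params['x']
    return (x - 3) ** 2

# Search space: a single continuous variable
space = {'x': hp.uniform('x', -10, 10)}

# Run TPE for 100 evaluations; best is a dict such as {'x': 2.99...}
best = fmin(objective, space=space, algo=tpe.suggest, max_evals=100)
print(best)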

  We will also use the wrapper approach for feature selection in this round, i.e. features are selected based on model output. Since correlation coefficients often fail to capture how important a feature actually is for the label, wrapper-selected features tend to be more effective. If we want the selection result to be more trustworthy, we can additionally combine the selection with cross-validation.

Wrapper Feature Selection

  Next comes the feature selection step. Here we choose the wrapper method: train a LightGBM model on all of the data, inspect the feature importances, and keep the 300 most important features. To make the selection more robust, we run it under cross-validation for multiple rounds. The multi-round procedure itself is straightforward: record the feature importances of each fold and aggregate them at the end. We can define the following function to carry out this process:

def feature_select_wrapper(train, test):
    """
    LightGBM feature-importance based feature selection
    :param train: training set
    :param test: test set
    :return: training and test sets restricted to the selected features
    """
    # Part 1. Collect feature names, dropping the ID and label columns
    print('feature_select_wrapper...')
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    # Part 2. Configure the lgb parameters
    # Model parameters
    params_initial = {
        'num_leaves': 31,
        'learning_rate': 0.1,
        'boosting': 'gbdt',
        'min_child_samples': 20,
        'bagging_seed': 2020,
        'bagging_fraction': 0.7,
        'bagging_freq': 1,
        'feature_fraction': 0.7,
        'max_depth': -1,
        'metric': 'rmse',
        'reg_alpha': 0,
        'reg_lambda': 1,
        'objective': 'regression'
    }
    # Control parameters
    # Early stopping rounds
    ESR = 30
    # Number of boosting rounds
    NBR = 10000
    # Logging interval
    VBE = 50

    # Part 3. Cross-validation
    # Instantiate the splitter
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    # Empty container for the accumulated feature importances
    fse = pd.Series(0, index=features)
    for train_part_index, eval_index in kf.split(train[features], train[label]):
        # Wrap the training fold
        train_part = lgb.Dataset(train[features].loc[train_part_index],
                                 train[label].loc[train_part_index])
        # Wrap the validation fold
        eval = lgb.Dataset(train[features].loc[eval_index],
                           train[label].loc[eval_index])
        # Train on the training fold while monitoring the validation fold
        bst = lgb.train(params_initial, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, eval],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=ESR, verbose_eval=VBE)
        # Accumulate the feature importances of this fold
        fse += pd.Series(bst.feature_importance(), features)

    # Part 4. Keep the 300 most important features
    feature_select = ['card_id'] + fse.sort_values(ascending=False).index.tolist()[:300]
    print('done')
    return train[feature_select + ['target']], test[feature_select]
train_LGBM, test_LGBM = feature_select_wrapper(train=train, test=test)

TPE Hyperparameter Optimization

Parameter Callback Function

  First, not every LightGBM hyperparameter needs to be searched. To prevent some of them from silently falling back to their defaults when the model is instantiated repeatedly, we first create a parameter callback function that re-declares the fixed values of these parameters each time the model is instantiated later on:

def params_append(params):
    """
    Parameter callback; params is treated as a dict
    :param params: lgb parameter dict
    :return params: corrected lgb parameter dict
    """
    params['feature_pre_filter'] = False
    params['objective'] = 'regression'
    params['metric'] = 'rmse'
    params['bagging_seed'] = 2020
    return params

Model Training and Hyperparameter Optimization Function

  Next comes the more involved part: model training and hyperparameter tuning. Unlike tuning inside sklearn, several libraries have to cooperate here, and LightGBM's parameter set is itself fairly complex, so the overall training-and-optimization procedure is more elaborate. We can carry it out with the following function:

def param_hyperopt(train):
    """
    Hyperparameter search and optimization
    :param train: training set
    :return params_best: best lgb parameters
    """
    # Part 1. Collect feature names, dropping the ID and label columns
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    # Part 2. Wrap the training data
    train_data = lgb.Dataset(train[features], train[label])

    # Part 3. Inner function mapping a set of hyperparameters to a loss value
    def hyperopt_objective(params):
        """
        Take hyperparameters, return the corresponding loss
        :param params: lgb parameter dict
        :return: minimum rmse
        """
        # Re-apply the fixed parameters
        params = params_append(params)
        print(params)
        # Use lgb's cv routine to get the best loss for this parameter set
        res = lgb.cv(params, train_data, 1000,
                     nfold=2,
                     stratified=False,
                     shuffle=True,
                     metrics='rmse',
                     early_stopping_rounds=20,
                     verbose_eval=False,
                     show_stdv=False,
                     seed=2020)
        return min(res['rmse-mean'])  # res is a dict

    # Part 4. lgb hyperparameter space
    params_space = {
        'learning_rate': hp.uniform('learning_rate', 1e-2, 5e-1),
        'bagging_fraction': hp.uniform('bagging_fraction', 0.5, 1),
        'feature_fraction': hp.uniform('feature_fraction', 0.5, 1),
        'num_leaves': hp.choice('num_leaves', list(range(10, 300, 10))),
        'reg_alpha': hp.randint('reg_alpha', 0, 10),
        'reg_lambda': hp.uniform('reg_lambda', 0, 10),
        'bagging_freq': hp.randint('bagging_freq', 1, 10),
        'min_child_samples': hp.choice('min_child_samples', list(range(1, 30, 5)))
    }

    # Part 5. TPE hyperparameter search
    params_best = fmin(hyperopt_objective,
                       space=params_space,
                       algo=tpe.suggest,
                       max_evals=30,
                       rstate=np.random.default_rng(2020))

    # Return the best parameters
    return params_best
best_clf = param_hyperopt(train_LGBM)
best_clf
{'bagging_fraction': 0.8198879482271282,
 'bagging_freq': 8,
 'feature_fraction': 0.5135156001738832,
 'learning_rate': 0.014657097603624963,
 'min_child_samples': 3,
 'num_leaves': 6,
 'reg_alpha': 7,
 'reg_lambda': 2.7194579245643924}
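  One caveat worth noting about this output (a side remark, not part of the original workflow): for parameters defined with hp.choice, fmin returns the index into the list of choices rather than the chosen value itself, so 'num_leaves': 6 above refers to the 7th entry of range(10, 300, 10) (i.e. 70) and 'min_child_samples': 3 to the 4th entry of range(1, 30, 5) (i.e. 16). hyperopt's space_eval can map the returned assignment back to the actual values; a sketch, assuming params_space (which is local to param_hyperopt above) has been made available in the current scope:

from hyperopt import space_eval

# Recover the actual values behind the hp.choice indices
params_actual = space_eval(params_space, best_clf)
print(params_actual)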

LightGBM Model Training

Single-Model Prediction

# Re-apply the fixed parameters
best_clf = params_append(best_clf)

# Data preparation
label = 'target'
features = train_LGBM.columns.tolist()
features.remove('card_id')
features.remove('target')

# Wrap the data
lgb_train = lgb.Dataset(train_LGBM[features], train_LGBM[label])

# Train the model on the full training set
bst = lgb.train(best_clf, lgb_train)
[LightGBM] [Warning] Auto-choosing col-wise multi-threading, the overhead of testing was 0.191293 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67206
[LightGBM] [Info] Number of data points in the train set: 201917, number of used features: 300
[LightGBM] [Info] Start training from score -0.393636
# Predict on the training set
bst.predict(train_LGBM[features])
array([-0.22014219, -1.65431888,  0.03512283, ..., -0.19317691,-1.07427114, -0.21523434])
# Quick look at the training-set RMSE
np.sqrt(mean_squared_error(train_LGBM[label], bst.predict(train_LGBM[features])))
3.7294731286175615
test_LGBM['target'] = bst.predict(test_LGBM[features])
test_LGBM[['card_id', 'target']].to_csv("result/submission_LGBM1.csv", index=False)
/home/DreamCode/anaconda3/envs/nlp/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
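  This SettingWithCopyWarning appears because test_LGBM was produced by column-slicing the original test DataFrame inside feature_select_wrapper. The assignment still lands correctly here, but a simple way to avoid the warning (an optional tweak, not part of the original code) is to assign onto an explicit copy:

test_LGBM = test_LGBM.copy()                              # detach from the original DataFrame
test_LGBM['target'] = bst.predict(test_LGBM[features])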
test_LGBM[['card_id', 'target']].head(5)
card_id target
0 C_ID_0ab67a22ab -2.180240
1 C_ID_130fd0cbdd -0.778037
2 C_ID_b709037bc5 -0.073457
3 C_ID_d27d835a9f -0.215132
4 C_ID_2b5e3df5c2 -0.294588

Combining with Cross-Validation

def train_predict(train, test, params):
    """
    :param train: training set
    :param test: test set
    :param params: lgb parameter dict
    :return:
    """
    # Part 1. Select the features
    label = 'target'
    features = train.columns.tolist()
    features.remove('card_id')
    features.remove('target')

    # Part 2. Re-apply the fixed parameters and set the control parameters
    params = params_append(params)
    ESR = 30
    NBR = 10000
    VBE = 50

    # Part 3. Containers for the results
    # Accumulated test-set predictions, saved to a local file at the end
    prediction_test = 0
    # Validation scores, for display
    cv_score = []
    # Out-of-fold predictions on the training set, saved to a local file at the end
    prediction_train = pd.Series()

    # Part 4. Cross-validation
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    for train_part_index, eval_index in kf.split(train[features], train[label]):
        # Wrap the training fold
        train_part = lgb.Dataset(train[features].loc[train_part_index],
                                 train[label].loc[train_part_index])
        # Wrap the validation fold
        eval = lgb.Dataset(train[features].loc[eval_index],
                           train[label].loc[eval_index])
        # Train on the training fold while monitoring the validation fold
        bst = lgb.train(params, train_part, num_boost_round=NBR,
                        valid_sets=[train_part, eval],
                        valid_names=['train', 'valid'],
                        early_stopping_rounds=ESR, verbose_eval=VBE)
        # Predict on the test set and accumulate into prediction_test
        prediction_test += bst.predict(test[features])
        # Predict on the validation fold and append to prediction_train
        prediction_train = prediction_train.append(pd.Series(bst.predict(train[features].loc[eval_index]),
                                                             index=eval_index))
        # Validation-fold predictions
        eval_pre = bst.predict(train[features].loc[eval_index])
        # Validation RMSE for this fold
        score = np.sqrt(mean_squared_error(train[label].loc[eval_index].values, eval_pre))
        # Record the fold score
        cv_score.append(score)

    # Part 5. Print and save the results
    # Print the per-fold scores and their mean
    print(cv_score, sum(cv_score) / 5)
    # Save the out-of-fold training predictions locally
    pd.Series(prediction_train.sort_index().values).to_csv("preprocess/train_lightgbm.csv", index=False)
    # Save the averaged test predictions locally
    pd.Series(prediction_test / 5).to_csv("preprocess/test_lightgbm.csv", index=False)
    # Use the averaged test predictions as the final model output
    test['target'] = prediction_test / 5
    # Write the submission file in the competition format
    test[['card_id', 'target']].to_csv("result/submission_lightgbm2.csv", index=False)
    return
train_predict(train_LGBM, test_LGBM, best_clf)

NLP Feature Engineering + XGBoost Modeling + Bayesian Optimizer

  Having worked through random forest and LightGBM, we now have a good sense of the competition modeling workflow for different ensemble algorithms, and you can follow the same pattern to try other ensembles. To push the results further, we can build NLP-style features around some of the ID columns in the dataset. In what follows we use two methods, CountVectorizer and TfidfVectorizer, to derive NLP features from selected columns, train an XGBoost model on the result, and tune it with another Bayesian optimizer (bayes_opt).

NLP Feature Engineering

  First, note that the dataset contains many ID-like columns besides card_id, including 'merchant_id', 'merchant_category_id', 'state_id', 'subsector_id' and 'city_id'. How often these IDs appear is closely tied to a user's actual transaction behaviour. For example, if a particular merchant id (say B) appears frequently in user A's transaction records, user A is clearly fond of merchant B; if merchant B also appears frequently across many different users' transactions, the merchant is broadly popular, which further suggests A's preferences are similar to most users'; conversely, A's preferences would be rather idiosyncratic. To mine this kind of information we can apply two NLP techniques, CountVectorizer and TF-IDF: CountVectorizer captures the "user A favours merchant B" type of signal, while TF-IDF further captures whether a user's preferences are common or unusual.
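  A tiny illustration of this idea (the two toy "transaction documents" below are made up purely for demonstration): each card's merchant ids are joined into one space-separated string and treated as a document, so the count matrix records how often each merchant appears for a card, while TF-IDF down-weights merchants that appear for almost every card:

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Two hypothetical cards; each "document" lists the merchants the card visited
docs = ["m_12 m_12 m_12 m_07", "m_07 m_99"]

cntv_demo = CountVectorizer().fit(docs)
print(sorted(cntv_demo.vocabulary_))          # ['m_07', 'm_12', 'm_99']
print(cntv_demo.transform(docs).toarray())    # raw merchant frequencies per card

tfv_demo = TfidfVectorizer().fit(docs)
print(tfv_demo.transform(docs).toarray())     # frequencies re-weighted by rarity across cards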

  Also, deriving features with these NLP methods creates a very large number of new columns, so we recommend using scipy's sparse-matrix utilities alongside them to store and reload the new features: build the features as CSR sparse matrices and save them to disk in npz format.
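  A minimal sketch of that storage pattern (the matrices and file name below are illustrative only):

from scipy import sparse
import numpy as np

# Stack two small feature blocks column-wise and convert to CSR format
left = sparse.csr_matrix(np.eye(3))
right = sparse.csr_matrix(np.ones((3, 2)))
block = sparse.hstack((left, right)).tocsr()

# Save to disk in npz format and load it back
sparse.save_npz("features_demo.npz", block)
restored = sparse.load_npz("features_demo.npz")
print(restored.shape)   # (3, 5)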

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse
import gc

Creating the NLP Features

# Note: these are the original, unprocessed data files
train = pd.read_csv('train.csv')
test =  pd.read_csv('test.csv')
merchant = pd.read_csv('merchants.csv')
new_transaction = pd.read_csv('new_merchant_transactions.csv')
history_transaction = pd.read_csv('historical_transactions.csv')
transaction = pd.concat([new_transaction, history_transaction], axis=0, ignore_index=True)
del new_transaction
del history_transaction
gc.collect()
9667
nlp_features = ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']

for co in nlp_features:
    print(co)
    transaction[co] = transaction[co].astype(str)

    temp = transaction[transaction['month_lag']>=0].groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_new']
    train = pd.merge(train, temp, how='left', on='card_id')
    test = pd.merge(test, temp, how='left', on='card_id')

    temp = transaction[transaction['month_lag']<0].groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_hist']
    train = pd.merge(train, temp, how='left', on='card_id')
    test = pd.merge(test, temp, how='left', on='card_id')

    temp = transaction.groupby("card_id")[co].apply(list).apply(lambda x:' '.join(x)).reset_index()
    temp.columns = ['card_id', co+'_all']
    train = pd.merge(train, temp, how='left', on='card_id').fillna("-1")
    test = pd.merge(test, temp, how='left', on='card_id').fillna("-1")
merchant_id
merchant_category_id
state_id
subsector_id
city_id
# Empty DataFrames to hold the NLP features
train_x = pd.DataFrame()
test_x = pd.DataFrame()

# Instantiate the CountVectorizer and TfidfVectorizer estimators
cntv = CountVectorizer()
tfv = TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9, use_idf=1, smooth_idf=1, sublinear_tf=1)

# Empty list to hold the derived column names
vector_feature = []
for co in ['merchant_id', 'merchant_category_id', 'state_id', 'subsector_id', 'city_id']:
    vector_feature.extend([co+'_new', co+'_hist', co+'_all'])

# Derive new features from each column
for feature in vector_feature:
    print(feature)
    cntv.fit(train[feature].append(test[feature]))
    train_x = sparse.hstack((train_x, cntv.transform(train[feature]))).tocsr()
    test_x = sparse.hstack((test_x, cntv.transform(test[feature]))).tocsr()
    tfv.fit(train[feature].append(test[feature]))
    train_x = sparse.hstack((train_x, tfv.transform(train[feature]))).tocsr()
    test_x = sparse.hstack((test_x, tfv.transform(test[feature]))).tocsr()

# Save the derived NLP features
sparse.save_npz("preprocess/train_nlp.npz", train_x)
sparse.save_npz("preprocess/test_nlp.npz", test_x)
merchant_id_new
merchant_id_hist
merchant_id_all
merchant_category_id_new
merchant_category_id_hist
merchant_category_id_all
state_id_new
state_id_hist
state_id_all
subsector_id_new
subsector_id_hist
subsector_id_all
city_id_new
city_id_hist
city_id_all
train_x.shape
(201917, 4235134)

XGBoost Model Training and Optimization

Data Loading

%pip install bayesian-optimization
import xgboost as xgb
from sklearn.feature_selection import f_regression
from numpy.random import RandomState
from bayes_opt import BayesianOptimization
train = pd.read_csv('preprocess/train.csv')
test = pd.read_csv('preprocess/test.csv')
features = train.columns.tolist()
features.remove('card_id')
features.remove('target')

train_x = sparse.load_npz("preprocess/train_nlp.npz")
test_x = sparse.load_npz("preprocess/test_nlp.npz")

train_x = sparse.hstack((train_x, train[features])).tocsr()
test_x = sparse.hstack((test_x, test[features])).tocsr()

Model Training

  Next we train the model. The workflow mirrors the LightGBM pipeline: first create a function that re-declares the fixed parameters, then define the search-and-optimization function, which calls the parameter callback internally, and finally define the prediction function, which uses cross-validation to produce the test-set predictions while also generating the out-of-fold predictions on the training set and each fold's predictions on the test set.

# Parameter callback function
def params_append(params):
    """
    :param params: xgb parameter dict
    :return: corrected xgb parameter dict
    """
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    params["min_child_weight"] = int(params["min_child_weight"])
    params['max_depth'] = int(params['max_depth'])
    return params

# Hyperparameter optimization function
def param_beyesian(train):
    """
    :param train: training features (sparse matrix)
    :return: best xgb parameters
    """
    # Part 1. Data preparation
    train_y = pd.read_csv("train.csv")['target']
    # Wrap a 10% subsample of the data to speed up the search
    sample_index = train_y.sample(frac=0.1, random_state=2020).index.tolist()
    train_data = xgb.DMatrix(train.tocsr()[sample_index, :], train_y.loc[sample_index].values, silent=True)

    # Objective function built on xgb's cv routine
    def xgb_cv(colsample_bytree, subsample, min_child_weight, max_depth, reg_alpha, eta, reg_lambda):
        """
        :param colsample_bytree:
        :param subsample:
        :param min_child_weight:
        :param max_depth:
        :param reg_alpha:
        :param eta:
        :param reg_lambda:
        :return: negative minimum test rmse (bayes_opt maximizes the objective)
        """
        params = {'objective': 'reg:squarederror',
                  'early_stopping_round': 50,
                  'eval_metric': 'rmse'}
        params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
        params['subsample'] = max(min(subsample, 1), 0)
        params["min_child_weight"] = int(min_child_weight)
        params['max_depth'] = int(max_depth)
        params['eta'] = float(eta)
        params['reg_alpha'] = max(reg_alpha, 0)
        params['reg_lambda'] = max(reg_lambda, 0)
        print(params)
        cv_result = xgb.cv(params, train_data,
                           num_boost_round=1000,
                           nfold=2, seed=2,
                           stratified=False,
                           shuffle=True,
                           early_stopping_rounds=30,
                           verbose_eval=False)
        return -min(cv_result['test-rmse-mean'])

    # Run the Bayesian optimizer over the parameter bounds
    xgb_bo = BayesianOptimization(
        xgb_cv,
        {'colsample_bytree': (0.5, 1),
         'subsample': (0.5, 1),
         'min_child_weight': (1, 30),
         'max_depth': (5, 12),
         'reg_alpha': (0, 5),
         'eta': (0.02, 0.2),
         'reg_lambda': (0, 5)}
    )
    # init_points is the number of initial random points, n_iter the number of optimization iterations (samples)
    xgb_bo.maximize(init_points=21, n_iter=5)
    print(xgb_bo.max['target'], xgb_bo.max['params'])
    return xgb_bo.max['params']

# Cross-validation prediction function
def train_predict(train, test, params):
    """
    :param train: training features (sparse matrix)
    :param test: test features (sparse matrix)
    :param params: best xgb parameters
    :return:
    """
    train_y = pd.read_csv("train.csv")['target']
    test_data = xgb.DMatrix(test)
    params = params_append(params)
    kf = KFold(n_splits=5, random_state=2020, shuffle=True)
    prediction_test = 0
    cv_score = []
    prediction_train = pd.Series()
    ESR = 30
    NBR = 10000
    VBE = 50
    for train_part_index, eval_index in kf.split(train, train_y):
        # Wrap the training and validation folds
        train_part = xgb.DMatrix(train.tocsr()[train_part_index, :],
                                 train_y.loc[train_part_index])
        eval = xgb.DMatrix(train.tocsr()[eval_index, :],
                           train_y.loc[eval_index])
        # Train while monitoring the validation fold
        bst = xgb.train(params, train_part, NBR, [(train_part, 'train'),
                                                  (eval, 'eval')], verbose_eval=VBE,
                        maximize=False, early_stopping_rounds=ESR, )
        # Accumulate the test-set predictions
        prediction_test += bst.predict(test_data)
        # Out-of-fold predictions on the validation fold
        eval_pre = bst.predict(eval)
        prediction_train = prediction_train.append(pd.Series(eval_pre, index=eval_index))
        # Validation RMSE for this fold
        score = np.sqrt(mean_squared_error(train_y.loc[eval_index].values, eval_pre))
        cv_score.append(score)
    # Print the per-fold scores and their mean
    print(cv_score, sum(cv_score) / 5)
    # Save the out-of-fold training predictions and the averaged test predictions
    pd.Series(prediction_train.sort_index().values).to_csv("preprocess/train_xgboost.csv", index=False)
    pd.Series(prediction_test / 5).to_csv("preprocess/test_xgboost.csv", index=False)
    # Write the submission file in the competition format
    test = pd.read_csv('test.csv')
    test['target'] = prediction_test / 5
    test[['card_id', 'target']].to_csv("result/submission_xgboost.csv", index=False)
    return
best_clf = param_beyesian(train_x)
train_predict(train_x, test_x, best_clf)

Model Fusion

  Broadly speaking, two fusion strategies are commonly used: Voting and Stacking. The goal of model fusion, much like the ensembling inside an ensemble model, is to exploit the strengths of different models and produce a more reliable final output. In Voting, we simply take a weighted combination of the different models' test-set predictions. Stacking is more involved: it reuses the validation-set (out-of-fold) predictions and test-set predictions of the earlier models to train a new model, taking the out-of-fold predictions as the new training features and the original training labels as the target; the trained meta-model is then applied to the models' test-set predictions, treated as the new prediction set, to obtain the final prediction.

Mean Fusion

  Let's start with Voting. Voting fusion generally comes in three flavours: mean fusion (average the prediction sets), weighted fusion (assign each prediction set a weight in some way and sum), and trick fusion (assign weights according to some special rule and sum). Here we walk through mean fusion and weighted fusion.

data = pd.read_csv("result/submission_randomforest.csv")
data['randomforest'] = data['target'].values

temp = pd.read_csv("result/submission_lightgbm2.csv")
data['lightgbm'] = temp['target'].values

temp = pd.read_csv("result/submission_xgboost.csv")
data['xgboost'] = temp['target'].values

print(data.corr())
                target  randomforest  lightgbm   xgboost
target        1.000000      1.000000  0.956675  0.943826
randomforest  1.000000      1.000000  0.956675  0.943826
lightgbm      0.956675      0.956675  1.000000  0.951885
xgboost       0.943826      0.943826  0.951885  1.000000
data['target'] = (data['randomforest'] + data['lightgbm'] + data['xgboost']) / 3
data[['card_id','target']].to_csv("result/voting_avr.csv", index=False)

Weighted Fusion

  The idea behind weighted fusion is simple: each model's predictions are given a different weight. The specific weights can be chosen from the three models' public leaderboard scores. For example, if models A and B score 2 and 3 respectively (where lower is better), we give A's predictions a weight of 3/5 and B's a weight of 2/5. The weighted fusion then looks like this:
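  For two models, this inverse-score weighting simply gives each model the other model's share of the total score. A small sketch of the arithmetic with made-up leaderboard scores (note that the 0.2/0.3/0.5 weights used in the code below were chosen by hand, not produced by this formula):

# Hypothetical public-leaderboard scores for two models (lower is better)
scores = {'A': 2.0, 'B': 3.0}
total = sum(scores.values())

# Each model is weighted by the other model's share of the total score
weights = {name: (total - s) / total for name, s in scores.items()}
print(weights)   # {'A': 0.6, 'B': 0.4}, i.e. 3/5 and 2/5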

data['target'] = data['randomforest']*0.2+data['lightgbm']*0.3 + data['xgboost']*0.5
data[['card_id','target']].to_csv("result/voting_wei1.csv", index=False)

Stacking Fusion

  Here we implement Stacking by hand. During the earlier model training we already produced the prediction_train and prediction_test files; these now serve as the training set and test set for the next round of modeling, which is what we call Stacking fusion.

Data Check

oof_rf = pd.read_csv('./preprocess/train_randomforest.csv')
predictions_rf = pd.read_csv('./preprocess/test_randomforest.csv')

oof_lgb = pd.read_csv('./preprocess/train_lightgbm.csv')
predictions_lgb = pd.read_csv('./preprocess/test_lightgbm.csv')

oof_xgb = pd.read_csv('./preprocess/train_xgboost.csv')
predictions_xgb = pd.read_csv('./preprocess/test_xgboost.csv')
predictions_rf.shape, predictions_lgb.shape
((123622, 1), (123622, 1))

Building the Stacking Model

def stack_model(oof_1, oof_2, oof_3, predictions_1, predictions_2, predictions_3, y):
    # Part 1. Data preparation
    # Column-stack the out-of-fold predictions of the three models;
    # train_stack is the training data for the final (meta) model
    train_stack = np.hstack([oof_1, oof_2, oof_3])
    # Column-stack the test-set predictions of the three models;
    # test_stack is the test data for the final model
    test_stack = np.hstack([predictions_1, predictions_2, predictions_3])
    # Array with as many rows as the training set (unused here)
    # oof = np.zeros(train_stack.shape[0])
    # Array with as many rows as the test set, for the accumulated predictions
    predictions = np.zeros(test_stack.shape[0])

    # Part 2. Repeated cross-validation
    from sklearn.model_selection import RepeatedKFold
    from sklearn.linear_model import BayesianRidge
    folds = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2020)
    # fold_ is the fold number, trn_idx / val_idx the train / validation indices of each fold
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_stack, y)):
        # Print the fold number
        print("fold n°{}".format(fold_+1))
        # Training features and labels for this fold
        trn_data, trn_y = train_stack[trn_idx], y[trn_idx]
        # Validation features and labels for this fold
        val_data, val_y = train_stack[val_idx], y[val_idx]
        # Announce the start of training
        print("-" * 10 + "Stacking " + str(fold_+1) + "-" * 10)
        # Bayesian ridge regression as the fusion (final) model
        clf = BayesianRidge()
        # Fit on the training fold
        clf.fit(trn_data, trn_y)
        # Predict on the validation fold and record the result in oof
        # oof[val_idx] = clf.predict(val_data)
        # Predict on the test set; each of the 10 folds contributes 1/10 of the final result
        predictions += clf.predict(test_stack) / (5 * 2)
    # Return the test-set predictions
    return predictions
target = pd.read_csv('train.csv')['target'].values
predictions_stack  = stack_model(oof_rf, oof_lgb, oof_xgb, predictions_rf, predictions_lgb, predictions_xgb, target)
predictions_stack
sub_df = pd.read_csv('data/sample_submission.csv')
sub_df["target"] = predictions_stack
sub_df.to_csv('predictions_stack1.csv', index=False)
