Table of Contents

  • In one sentence
  • Model structure
  • GBDT mind map
  • GBDT feature transformation process
  • GBDT+LR code implementation 1
  • GBDT+LR code implementation 2

In one sentence

GBDT+LR uses a GBDT to perform feature selection and feature combination automatically, forming a new discrete feature vector, which is then fed to an LR model as input to produce the prediction.

Model structure


GBDT mind map

GBDT feature transformation process


For example, as shown in the figure below, the GBDT consists of three subtrees, each with four leaf nodes. When a training sample is fed in, it falls into the third leaf node of subtree 1, giving the feature vector [0, 0, 1, 0]; the first leaf node of subtree 2, giving [1, 0, 0, 0]; and the fourth leaf node of subtree 3, giving [0, 0, 0, 1]. Concatenating these produces the final feature vector [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1], i.e. the vector contains exactly as many 1s as there are trees. A minimal sketch of this encoding follows.
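A minimal sketch of the transformation, using the illustrative leaf positions from the example above (in practice the leaf indices come from the trained model, e.g. grd.apply(X) in scikit-learn or predict(..., pred_leaf=True) in LightGBM, as in the two implementations below):

import numpy as np

n_trees, n_leaves = 3, 4   # three subtrees with four leaves each
leaf_index = [2, 0, 3]     # 0-based leaf hit by the sample in trees 1-3

# one-hot encode each tree's leaf and concatenate the three vectors
feature_vector = np.zeros(n_trees * n_leaves, dtype=int)
for tree, leaf in enumerate(leaf_index):
    feature_vector[tree * n_leaves + leaf] = 1

print(feature_vector)  # [0 0 1 0 1 0 0 0 0 0 0 1]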

GBDT+LR code implementation 1

'''
@Time : 2021/1/15 11:32
@Author : WGS
@remarks : GBDT+LR
'''
import numpy as np
np.random.seed(10)
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve

# number of trees (default 10)
n_estimator = 10

# build a synthetic binary classification dataset
X, y = make_classification(n_samples=80000)

# first split: half for training, half for testing;
# second split: carve out a separate slice for training the LR
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.5)

grd = GradientBoostingClassifier(n_estimators=n_estimator)  # GBDT model
grd_enc = OneHotEncoder()      # one-hot encoder for leaf indices
grd_lm = LogisticRegression()  # LR model

grd.fit(X_train, y_train)
# grd.apply(X) returns leaf indices with shape (n_samples, n_trees, 1);
# [:, :, 0] drops the last axis to get a 2-D array
grd_enc.fit(grd.apply(X_train)[:, :, 0])
grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr)

y_pred_grd_lm = grd_lm.predict_proba(grd_enc.transform(grd.apply(X_test)[:, :, 0]))[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)

# plain GBDT as a baseline for comparison
y_pred_grd = grd.predict_proba(X_test)[:, 1]
fpr_grd, tpr_grd, _ = roc_curve(y_test, y_pred_grd)

plt.figure(2)
plt.xlim(0, 0.2)
plt.ylim(0.8, 1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_grd, tpr_grd, label='GBDT')
plt.plot(fpr_grd_lm, tpr_grd_lm, label='GBDT + LR')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve (zoomed in at top left)')
plt.legend(loc='best')
plt.show()
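Note the double split above: the GBDT is fit on X_train, while the LR is fit on the leaf encodings of the disjoint slice X_train_lr. Training both stages on the same rows would let the LR overfit leaf patterns the trees have already memorized, so holding out a separate slice gives a more honest downstream model. Also, OneHotEncoder.transform returns a sparse matrix, which LogisticRegression accepts directly, so the wide leaf-indicator features stay cheap to store.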

GBDT+LR code implementation 2

'''
@Time : 2021/1/15 11:32
@Author : WGS
@remarks : GBDT+LR
'''
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import log_loss
import warnings
warnings.filterwarnings('ignore')
import gc
from scipy import sparse


def preProcess():
    path = 'data/'
    print('Reading data...')
    df_train = pd.read_csv(path + 'train.csv')
    df_test = pd.read_csv(path + 'test.csv')
    print('Reading done')
    df_train.drop(['Id'], axis=1, inplace=True)
    df_test.drop(['Id'], axis=1, inplace=True)
    df_test['Label'] = -1
    data = pd.concat([df_train, df_test])
    data = data.fillna(-1)
    data.to_csv('data/data.csv', index=False)
    return data


def lr_predict(data, category_feature, continuous_feature):  # 0.47181
    # min-max scale the continuous features
    print('Scaling...')
    scaler = MinMaxScaler()
    for col in continuous_feature:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    print('Scaling done')
    # one-hot encode the categorical features
    print('One-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot done')
    train = data[data['Label'] != -1]
    target = train.pop('Label')
    test = data[data['Label'] == -1]
    test.drop(['Label'], axis=1, inplace=True)
    # train/validation split
    print('Splitting data...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2018)
    print('Training...')
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    print('tr-logloss: ', tr_logloss)
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
    print('val-logloss: ', val_logloss)
    print('Predicting...')
    y_pred = lr.predict_proba(test)[:, 1]
    print('Writing results...')
    res = pd.read_csv('data/test.csv')
    submission = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
    submission.to_csv('submission/submission_lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index=False)
    print('Done')


def gbdt_predict(data, category_feature, continuous_feature):  # 0.44548
    # one-hot encode the categorical features
    print('One-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot done')
    train = data[data['Label'] != -1]
    target = train.pop('Label')
    test = data[data['Label'] == -1]
    test.drop(['Label'], axis=1, inplace=True)
    # train/validation split
    print('Splitting data...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2018)
    print('Training...')
    gbm = lgb.LGBMClassifier(objective='binary',
                             subsample=0.8,
                             min_child_weight=0.5,
                             colsample_bytree=0.7,
                             num_leaves=100,
                             max_depth=12,
                             learning_rate=0.01,
                             n_estimators=10000,
                             )
    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            early_stopping_rounds=100,
            )
    tr_logloss = log_loss(y_train, gbm.predict_proba(x_train)[:, 1])
    val_logloss = log_loss(y_val, gbm.predict_proba(x_val)[:, 1])
    y_pred = gbm.predict_proba(test)[:, 1]
    print('Writing results...')
    res = pd.read_csv('data/test.csv')
    submission = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
    submission.to_csv('submission/submission_gbdt_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index=False)
    print('Done')


def gbdt_lr_predict(data, category_feature, continuous_feature):  # 0.43616
    # one-hot encode the categorical features
    print('One-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot done')
    train = data[data['Label'] != -1]
    target = train.pop('Label')
    test = data[data['Label'] == -1]
    test.drop(['Label'], axis=1, inplace=True)
    # train/validation split
    print('Splitting data...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2018)
    print('Training GBDT...')
    # only the booster's leaf indices are used downstream
    gbm = lgb.LGBMRegressor(objective='binary',
                            subsample=0.8,
                            min_child_weight=0.5,
                            colsample_bytree=0.7,
                            num_leaves=100,
                            max_depth=12,
                            learning_rate=0.05,
                            n_estimators=10,
                            )
    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            # early_stopping_rounds = 100,
            )
    model = gbm.booster_
    print('Extracting leaf indices...')
    gbdt_feats_train = model.predict(train, pred_leaf=True)
    gbdt_feats_test = model.predict(test, pred_leaf=True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name)
    print('Building the new dataset...')
    train = pd.concat([train, df_train_gbdt_feats], axis=1)
    test = pd.concat([test, df_test_gbdt_feats], axis=1)
    train_len = train.shape[0]
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()
    # # min-max scale the continuous features
    # print('Scaling...')
    # scaler = MinMaxScaler()
    # for col in continuous_feature:
    #     data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    # print('Scaling done')
    # one-hot encode the leaf indices
    print('One-hot encoding...')
    for col in gbdt_feats_name:
        print('this is feature:', col)
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data.drop([col], axis=1, inplace=True)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot done')
    train = data[:train_len]
    test = data[train_len:]
    del data
    gc.collect()
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.3, random_state=2018)
    # LR on the leaf features
    print('Training LR...')
    lr = LogisticRegression()
    lr.fit(x_train, y_train)
    tr_logloss = log_loss(y_train, lr.predict_proba(x_train)[:, 1])
    print('tr-logloss: ', tr_logloss)
    val_logloss = log_loss(y_val, lr.predict_proba(x_val)[:, 1])
    print('val-logloss: ', val_logloss)
    print('Predicting...')
    y_pred = lr.predict_proba(test)[:, 1]
    print('Writing results...')
    res = pd.read_csv('data/test.csv')
    submission = pd.DataFrame({'Id': res['Id'], 'Label': y_pred})
    submission.to_csv('submission/submission_gbdt+lr_trlogloss_%s_vallogloss_%s.csv' % (tr_logloss, val_logloss), index=False)
    print('Done')


def gbdt_ffm_predict(data, category_feature, continuous_feature):
    # one-hot encode the categorical features (the raw columns are kept
    # because they are needed again when building the FFM dataset)
    print('One-hot encoding...')
    for col in category_feature:
        onehot_feats = pd.get_dummies(data[col], prefix=col)
        data = pd.concat([data, onehot_feats], axis=1)
    print('One-hot done')
    feats = [col for col in data if col not in category_feature]  # onehot_feats + continuous_feature
    tmp = data[feats]
    train = tmp[tmp['Label'] != -1]
    target = train.pop('Label')
    test = tmp[tmp['Label'] == -1]
    test.drop(['Label'], axis=1, inplace=True)
    # train/validation split
    print('Splitting data...')
    x_train, x_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=2018)
    print('Training GBDT...')
    gbm = lgb.LGBMRegressor(objective='binary',
                            subsample=0.8,
                            min_child_weight=0.5,
                            colsample_bytree=0.7,
                            num_leaves=100,
                            max_depth=12,
                            learning_rate=0.05,
                            n_estimators=10,
                            )
    gbm.fit(x_train, y_train,
            eval_set=[(x_train, y_train), (x_val, y_val)],
            eval_names=['train', 'val'],
            eval_metric='binary_logloss',
            # early_stopping_rounds = 100,
            )
    model = gbm.booster_
    print('Extracting leaf indices...')
    gbdt_feats_train = model.predict(train, pred_leaf=True)
    gbdt_feats_test = model.predict(test, pred_leaf=True)
    gbdt_feats_name = ['gbdt_leaf_' + str(i) for i in range(gbdt_feats_train.shape[1])]
    df_train_gbdt_feats = pd.DataFrame(gbdt_feats_train, columns=gbdt_feats_name)
    df_test_gbdt_feats = pd.DataFrame(gbdt_feats_test, columns=gbdt_feats_name)
    print('Building the new dataset...')
    tmp = data[category_feature + continuous_feature + ['Label']]
    train = tmp[tmp['Label'] != -1]
    test = tmp[tmp['Label'] == -1]
    train = pd.concat([train, df_train_gbdt_feats], axis=1)
    test = pd.concat([test, df_test_gbdt_feats], axis=1)
    data = pd.concat([train, test])
    del train
    del test
    gc.collect()
    # min-max scale the continuous features
    print('Scaling...')
    scaler = MinMaxScaler()
    for col in continuous_feature:
        data[col] = scaler.fit_transform(data[col].values.reshape(-1, 1))
    print('Scaling done')
    data.to_csv('data/data.csv', index=False)
    return category_feature + gbdt_feats_name


def FFMFormat(df, label, path, train_len, category_feature=[], continuous_feature=[]):
    index = df.shape[0]
    train = open(path + 'train.ffm', 'w')
    test = open(path + 'test.ffm', 'w')
    feature_index = 0
    feat_index = {}
    for i in range(index):
        feats = []
        field_index = 0
        for j, feat in enumerate(category_feature):
            t = feat + '_' + str(df[feat][i])
            if t not in feat_index:
                feat_index[t] = feature_index
                feature_index = feature_index + 1
            feats.append('%s:%s:%s' % (field_index, feat_index[t], 1))
            field_index = field_index + 1
        for j, feat in enumerate(continuous_feature):
            # register each continuous feature once so its index stays stable
            # across rows (the original bumped feature_index on every row)
            if feat not in feat_index:
                feat_index[feat] = feature_index
                feature_index = feature_index + 1
            feats.append('%s:%s:%s' % (field_index, feat_index[feat], df[feat][i]))
            field_index = field_index + 1
        # print('%s %s' % (df[label][i], ' '.join(feats)))  # very noisy: one line per row
        if i < train_len:
            train.write('%s %s\n' % (df[label][i], ' '.join(feats)))
        else:
            test.write('%s\n' % (' '.join(feats)))
    train.close()
    test.close()


if __name__ == '__main__':
    data = preProcess()
    continuous_feature = ['I'] * 13
    continuous_feature = [col + str(i + 1) for i, col in enumerate(continuous_feature)]
    category_feature = ['C'] * 26
    category_feature = [col + str(i + 1) for i, col in enumerate(category_feature)]
    # lr_predict(data, category_feature, continuous_feature)
    # gbdt_predict(data, category_feature, continuous_feature)
    # gbdt_lr_predict(data, category_feature, continuous_feature)
    category_feature = gbdt_ffm_predict(data, category_feature, continuous_feature)
    data = pd.read_csv('data/data.csv')
    df_train = pd.read_csv('data/train.csv')
    FFMFormat(data, 'Label', 'data/', df_train.shape[0], category_feature, continuous_feature)
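For reference, FFMFormat emits the standard libffm text format: one sample per line, the label followed by space-separated field:index:value triples (the label is omitted in test.ffm, matching the code above). A hypothetical line with one categorical field (one-hot, so value 1) and one continuous field might look like:

1 0:3:1 1:42:0.27

Here field 0 is a categorical column and field 1 a continuous column; the middle number is the global feature index assigned in feat_index.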
