导入包

# coding: utf-8
#导入包
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

读取训练集数据

# Load the four training tables (base profile, operations, transactions, labels).
train_base, train_op, train_trans, train_label = map(
    pd.read_csv,
    (
        'train/train_base.csv',
        'train/train_op.csv',
        'train/train_trans.csv',
        'train/train_label.csv',
    ),
)

读取测试集数据

# Load the three test tables (round A): base profile, operations, transactions.
test_base, test_op, test_trans = map(
    pd.read_csv,
    (
        'test/test_a_base.csv',
        'test/test_a_op.csv',
        'test/test_a_trans.csv',
    ),
)

我们发现训练集和测试集中service3_level字段缺失较多,因此删除此列。然后删除训练集中sex或balance_avg为空的行,得到不含缺失值的训练集。
对于测试集,其他列缺失值很少,直接用下一行的数据补全(bfill)。

# Build the modelling frames and remove missing values.
#
# The script uses train_df / test_df from here on but never defined them.
# NOTE(review): assumes train_label carries `user` and `label` columns and
# joins to train_base on `user` (later cells read both columns from
# train_df) -- confirm against the data files.
train_df = train_base.merge(train_label, on='user')
test_df = test_base.copy()

# Training set: drop rows where `sex` or `balance_avg` is missing, then drop
# the `service3_level` column.
train_df = train_df.dropna(subset=['sex', 'balance_avg'])
train_df = train_df.drop(columns='service3_level')

# Test set: drop the same column, then fill each remaining gap with the next
# valid value below it (back-fill).
test_df = test_df.drop(columns='service3_level')
test_df = test_df.bfill()
test_df.info()

对于base数据集中的类别型字段:有明显顺序关系或类别数目较少的特征采用label编码;类别较多且没有顺序关系的特征暂时舍弃。

# Label-encode the ordinal / low-cardinality columns of train_df and test_df.
#
# Each entry maps a column to (string prefix, first code, last code): the
# strings '<prefix> <first>' ... '<prefix> <last>' are replaced by the matching
# integers. The ranges are exactly the ones the original per-value assignments
# covered.
_CODE_RANGES = {
    'sex': ('category', 0, 1),
    'level': ('category', 0, 2),
    'verified': ('category', 0, 1),
    'agreement1': ('category', 0, 1),
    'agreement2': ('category', 0, 1),
    'agreement3': ('category', 0, 1),
    'agreement4': ('category', 0, 1),
    'balance': ('level', 0, 21),
    'balance_avg': ('level', 0, 21),
    'balance1': ('level', 0, 21),
    'balance1_avg': ('level', 0, 21),
    'balance2': ('level', 0, 21),
    'balance2_avg': ('level', 0, 5),
    'service3': ('category', 0, 1),
    'product1_amount': ('level', 1, 7),
    'product2_amount': ('level', 1, 21),
    'product3_amount': ('level', 1, 3),
    'product4_amount': ('level', 0, 1),
    'product5_amount': ('level', 0, 1),
    'product6_amount': ('level', 1, 21),
}


def _encode_prefixed_codes(frame, code_ranges=_CODE_RANGES):
    """Replace '<prefix> <n>' strings with the integer n, in place.

    Args:
        frame: DataFrame containing every column named in ``code_ranges``.
        code_ranges: mapping of column name to (prefix, first, last).

    Values outside a column's listed range (including NaN) are left untouched.
    Whole columns are assigned back to the frame instead of using chained
    ``frame[col].loc[mask] = value`` writes, which do not update the frame when
    pandas copy-on-write is active.
    """
    for column, (prefix, first, last) in code_ranges.items():
        mapping = {
            '{} {}'.format(prefix, code): code
            for code in range(first, last + 1)
        }
        frame[column] = frame[column].map(
            lambda value, lookup=mapping: lookup.get(value, value)
        )


_encode_prefixed_codes(train_df)
_encode_prefixed_codes(test_df)

删除类别较多的特征

# Discard the categorical columns that are not label-encoded in this baseline.
high_cardinality_columns = ['province', 'provider', 'city', 'regist_type']
for frame in (train_df, test_df):
    frame.drop(columns=high_cardinality_columns, inplace=True)

更改数据类型

# Convert the remaining object-dtype columns to integers.
def _object_columns_to_int(frame, id_column='user'):
    """Cast every object-dtype column except ``id_column`` to int, in place.

    Args:
        frame: DataFrame to convert; must contain ``id_column``.
        id_column: column excluded from the conversion.

    Returns:
        The column index that was inspected (all columns but ``id_column``).

    Raises:
        KeyError: if ``id_column`` is not a column of ``frame``.
        ValueError: if an object column holds a value ``int`` cannot parse.
    """
    columns = frame.columns.drop(id_column)
    for column in columns:
        if frame[column].dtype == object:
            frame[column] = frame[column].astype(int)
    return columns


features = _object_columns_to_int(train_df)
# As in the original, `features` ends up bound to the test-set columns.
features = _object_columns_to_int(test_df)

训练模型

# Train a LightGBM binary classifier and predict the test set.
drop_columns = ['user', 'label']
clf = lgb

# Model inputs: every training column except the id and the target.
features = train_df.columns.drop(drop_columns)
train_x = train_df[features]
train_y = train_df['label']
# Forward-fill any gaps that remain in the test features.
test_x = test_df[features].ffill()

# Hold out the last 10% of the rows (in file order, no shuffling) for validation.
nums = int(train_x.shape[0] * 0.90)
trn_x, trn_y = train_x[:nums], train_y[:nums]
val_x, val_y = train_x[nums:], train_y[nums:]

train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)
data_matrix = clf.Dataset(train_x, label=train_y)

params = {
    'boosting_type': 'gbdt',
    # A list rather than a set, so the metric order is the same on every run.
    'metric': ['binary_logloss', 'auc'],
    'min_child_weight': 5,
    'num_leaves': 2 ** 6,  # 64
    'objective': 'binary',
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'learning_rate': 0.01,
    'seed': 520,
    'min_data_in_leaf': 500,
}

# Step 1: fit on the 90% split with early stopping to find the round count.
# Early stopping and logging go through callbacks; the `early_stopping_rounds`
# and `verbose_eval` keyword arguments are not accepted by LightGBM >= 4.0.
model = clf.train(
    params,
    train_matrix,
    num_boost_round=50000,
    valid_sets=[train_matrix, valid_matrix],
    callbacks=[
        clf.early_stopping(stopping_rounds=1000),
        clf.log_evaluation(period=500),
    ],
)

# Step 2: refit on all labelled rows for the round count found in step 1.
model2 = clf.train(params, data_matrix, num_boost_round=model.best_iteration)

# Validation predictions come from the early-stopped model at its best round.
# Test predictions come from the full-data refit. The original called `model`
# for both and passed `model2.best_iteration`, so `model2` was never used.
val_pred = model.predict(val_x, num_iteration=model.best_iteration).reshape(-1, 1)
test_pred = model2.predict(test_x).reshape(-1, 1)
lgb_train, lgb_test = val_pred, test_pred

提交文件

# Write the submission file: one predicted probability per test user,
# sorted by the `user` column.
sub = pd.DataFrame({'user': test_df.user, 'prob': test_pred[:, 0]})
sub = sub.sort_values(by='user')
sub.to_csv('sub.csv', index=False)

本baseline只是简单的对数据进行了编码,还没有构造其他特征以及数据预处理等。

第二届翼支付杯大数据建模大赛-信用风险用户识别Baseline 线上0.65+稳进复赛相关推荐

  1. 三城演义!第二届中国移动“梧桐杯”大数据应用创新大赛复赛完美收官

    8月19日至26日,第二届中国移动"梧桐杯"大数据应用创新大赛暨大数据创客马拉松大赛的三场复赛路演暨颁奖典礼在杭州.武汉.广州三地成功举办.35支队伍在数智乡村.数智城市.数智交通 ...

  2. 报名开始!第二届中国移动“梧桐杯”大数据应用创新大赛邀你夺52w大奖!

    为进一步落实中国移动战略,助力公司数字化转型发展,推动高校人才创新培养,由中国移动通信集团有限公司主办,中国移动通信集团北京有限公司.中国移动通信集团湖北有限公司.中国移动通信集团广东有限公司.中国移 ...

  3. 【报名开始】第二届中国移动“梧桐杯”大数据应用创新大赛邀你夺52w大奖

    为进一步落实中国移动战略,助力公司数字化转型发展,推动高校人才创新培养,由中国移动通信集团有限公司主办,中国移动通信集团北京有限公司.中国移动通信集团湖北有限公司.中国移动通信集团广东有限公司.中国移 ...

  4. 2017第二届中国信息通信大数据大会将于6月在京召开

    随着"国家大数据战略"落实实施,<信息通信行业发展规划(2016-2020年)>发布,信息通信业以大数据为基础,从传统电信服务向互联网服务延伸,深入推进与经济社会各领域 ...

  5. 【好消息】高录用、EI检索会议 | 2023年第二届电子信息工程、大数据与计算机技术国际学术会议(EIBDCT 2023)

    [重要信息] 大会官网:www.eibdct.net 中文网站:https://www.ais.cn/attendees/index/MY7VVM 大会时间:2023年01月06-08日 大会地点:中 ...

  6. 第二届云计算大会暨大数据高峰论坛举办

    本文讲的是第二届云计算大会暨大数据高峰论坛举办,2012年9月20日,北京.由IT商业新闻网.全国CIO/CTO俱乐部主办,<IT时代周刊>协办的"智在应用--第二届云计算大会暨 ...

  7. 【2016年第6期】情境大数据建模及其在用户行为预测中的应用

    吴书,刘强,王亮 中国科学院自动化研究所智能感知与计算研究中心,北京 100190 摘要:随着大数据时代的到来,信息系统收集了海量情境信息,如舆情信息.环境信息.经济信息等.这些情景大数据提供丰富的细 ...

  8. 天池广东工业智造大数据创新大赛--铝型材表面瑕疵识别 --top1方案

    天池广东工业智造大数据创新大赛--铝型材表面瑕疵识别 --top1方案 共同启动"广东工业智造大数据创新大赛",旨在通过数据开放召集全球众智,将重点围绕工业制造大数据展开,以应用为 ...

  9. 前海征信“好信杯”大数据算法大赛——入门篇笔记

    ctr+4/5注释 数据下载地址 1.先导入包: import pandas as pd import numpy as np import seaborn as sns import matplot ...

  10. 又一数据挖掘赛事,在校生专属,翼支付杯来了(直通实习机会)

    Datawhale 主办方:中国电信-翼支付,数据挖掘赛事 为了积极研究探索"金融科技FinTech"技术并努力应用到实际业务中,挖掘更多金融科技在实际普惠金融业务的应用方案.由翼 ...

最新文章

  1. LaTex中编辑公式的上下角标出现的问题---允许{}的嵌套吗?
  2. CGAffineTransform与CATransform3D
  3. 游戏外挂技术:编程实现内存检索(检索内存中指定数据)
  4. shell多行匹配如何实现
  5. Excel文件弹出另存为代码
  6. java安卓写文件路径,如何使用gradle作为构建系统,平台Android配置Protobuf(Java)文件的输出路径?...
  7. 傅里叶变换 【完整版】
  8. 数据流中的中位数 c语言,41 数据流中的中位数(时间效率)
  9. 为什么 mysql 里的 ibdata1 文件不断的增长?
  10. 三菱PLC编程软件:GX WORKS2和GX WORKS3的区别
  11. HCIE Security AC的准入技术 备考笔记(幕布)
  12. 中国工业管理软件如何突围?
  13. ref获取元素 vue 删除子元素_vue 添加删除子元素
  14. 如何在excel中输入身份证号
  15. 静态网页设计——春节
  16. linux mint 搜狗 乱码,解决linux mint wine微信字体显示问题
  17. isNaN()和isFinite()的应用
  18. 怎么让游戏强制窗口化_游戏防上瘾,趁父母午睡时拍照片让你玩不成!
  19. 计算机科学个人陈述中文,计算机专业个人陈述范文
  20. minor.major version 详解

热门文章

  1. egret给对象涂颜色
  2. 自动控制原理->控制系统性能
  3. 授课型英硕申请Ph.D (带奖)历程
  4. Vue--搭建个人博客
  5. Matlab绘制树形图
  6. 为什么现在很多人不看好商汤科技?
  7. Android 多语言应用(步骤+源码)
  8. linux设置mysql开机启动
  9. java 将海外时区转换为北京时区
  10. 华为鸿蒙星星之火,星星之火 数码视讯支持华为鸿蒙发布会全球直播