20200317_决策树预测贷款申请

使用决策树，预测贷款申请

import pandas as pd
#  忽略弹出的warnings
import warnings
warnings.filterwarnings('ignore')

text=pd.read_excel('data/LoanStats_securev1_2019Q4.xlsx')
text.head()

	id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	sub_grade	emp_title	...	num_tl_op_past_12m	pct_tl_nvr_dlq	percent_bc_gt_75	pub_rec_bankruptcies	tot_hi_cred_lim	total_bal_ex_mort	total_bc_limit	total_il_high_credit_limit
0	164027473	20000	20000	20000	36 months	0.1240	668.12	B	B4	NaN	...	2	100.0	50.0	1	60800	42566	5200	40000.0
1	163984413	16500	16500	16500	60 months	0.1033	353.27	B	B1	NaN	...	0	100.0	0.0	0	223390	40913	40500	39890.0
2	164193225	7500	7500	7500	36 months	0.1240	250.55	B	B4	Rn	...	7	54.5	16.7	0	138468	102122	47700	90768.0
3	162948736	19000	19000	18975	36 months	0.0646	581.99	A	A1	Tech Ops Analyst	...	0	100.0	40.0	0	184034	28461	38400	35000.0
4	164161686	10000	10000	10000	36 months	0.2055	374.45	D	D2	Planner	...	2	100.0	16.7	0	639373	161516	24600	172818.0

5 rows × 114 columns

目标变量

text['loan_status'].value_counts()

Current               122625
Fully Paid              3539
In Grace Period         1079
Late (31-120 days)       509
Late (16-30 days)        304
Charged Off               80
n                          1
Name: loan_status, dtype: int64

#0为已经完成的
def function(x):if 'Current' in x:return 0elif 'Fully Paid' in x:return 0else:return 1
text['loan_status']=text.apply(lambda x:function(x['loan_status']),axis=1)

text['loan_status'].value_counts()

0    126164
1      1973
Name: loan_status, dtype: int64

pos_trainDf = text[text['loan_status'] == 1]
neg_trainDf = text[text['loan_status'] == 0].sample(n=4000, random_state=2018)
text = pd.concat([pos_trainDf, neg_trainDf], axis=0).sample(frac=1.0,random_state=2018)

text.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5973 entries, 18821 to 92872
Columns: 114 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(36), int64(50), object(27)
memory usage: 5.2+ MB

缺失值查看

check_null = text.isnull().sum(axis=0).sort_values(ascending=False)/float(len(text)) #查看缺失值比例
print(check_null[check_null >0.2]) # 查看缺失比例大于20%的属性。

desc                              0.999833
mths_since_last_record            0.899046
verification_status_joint         0.880629
annual_inc_joint                  0.864055
dti_joint                         0.864055
mths_since_recent_bc_dlq          0.794408
mths_since_last_major_derog       0.769965
mths_since_recent_revol_delinq    0.703164
mths_since_last_delinq            0.548468
dtype: float64

thresh_count = len(text)*0.4 # 设定阀值
data = text.dropna(thresh=thresh_count, axis=1 ) #若某一列数据缺失的数量超过阀值就会被删除

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5973 entries, 18821 to 92872
Columns: 106 entries, id to total_il_high_credit_limit
dtypes: datetime64[ns](1), float64(30), int64(50), object(25)
memory usage: 4.9+ MB

删除无意义的列

sub_grade：与Grade的信息重复

emp_title ：缺失值较多，同时不能反映借款人收入或资产的真实情况

zip_code：地址邮编，邮编显示不全，没有意义

addr_state：申请地址所属州，不能反映借款人的偿债能力

last_credit_pull_d ：LendingClub平台最近一个提供贷款的时间，没有意义

policy_code ：变量信息全为1

pymnt_plan 基本是n

title： title与purpose的信息重复，同时title的分类信息更加离散

next_pymnt_d : 下一个付款时间，没有意义

policy_code : 没有意义

collection_recovery_fee: 全为0，没有意义

earliest_cr_line : 记录的是借款人发生第一笔借款的时间

issue_d ：贷款发行时间，这里提前向模型泄露了信息

last_pymnt_d、collection_recovery_fee、last_pymnt_amnt：预测贷款违约模型是贷款前的风险控制手段，这些贷后信息都会影响我们训练模型的效果，在此将这些信息删除

drop_list = ['sub_grade', 'emp_title',  'title', 'zip_code', 'addr_state', 'mths_since_last_delinq' ,'initial_list_status','title','issue_d','last_pymnt_d','last_pymnt_amnt','next_pymnt_d','last_credit_pull_d','policy_code','collection_recovery_fee', 'earliest_cr_line']
data.drop(drop_list, axis=1, inplace = True)

data.head()

	id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	emp_length	home_ownership	...	num_tl_op_past_12m	pct_tl_nvr_dlq	percent_bc_gt_75	tot_hi_cred_lim	total_bal_ex_mort	total_bc_limit	total_il_high_credit_limit
18821	163425898	4500	4500	4500	36 months	0.1612	158.48	C	NaN	RENT	...	2	100.0	28.6	44700	10872	32800	0.0
61234	161908366	20000	20000	20000	60 months	0.2305	564.39	D	NaN	OWN	...	0	100.0	33.3	54349	19572	10400	22349.0
119781	159901427	10000	10000	10000	60 months	0.1862	257.32	D	6 years	OWN	...	3	100.0	0.0	69077	48184	9600	49477.0
49201	162292591	21000	21000	21000	60 months	0.1430	491.91	C	< 1 year	RENT	...	0	100.0	0.0	109894	66662	33800	67194.0
53727	162154208	40000	40000	40000	60 months	0.0819	814.70	A	10+ years	RENT	...	0	100.0	50.0	207370	160985	98000	61725.0

5 rows × 91 columns

分类变量

objectColumns = data.select_dtypes(include=["object"]).columns
data[objectColumns].isnull().sum().sort_values(ascending=False)

emp_length             572
application_type         1
url                      1
total_acc                0
delinq_2yrs              0
purpose                  0
pymnt_plan               0
verification_status      0
annual_inc               0
home_ownership           0
grade                    0
term                     0
dtype: int64

# data['int_rate'] = data['int_rate'].str.rstrip('%').astype('float')
# data['revol_util'] = data['revol_util'].str.rstrip('%').astype('float')
# data['annual_inc'] = data['annual_inc'].str.replace(",","").astype('float')

import numpy as np
objectColumns = data.select_dtypes(include=["object"]).columns # 筛选数据类型为object的数据
data[objectColumns] = data[objectColumns].fillna("Unknown") #以分类“Unknown”填充缺失值

import missingno as msno
import matplotlib as mpl
mpl.rcParams['font.sans-serif']=[u'simHei']
mpl.rcParams['axes.unicode_minus']=False
%matplotlib inline
msno.bar(data[objectColumns]) #可视化

<matplotlib.axes._subplots.AxesSubplot at 0x2cacc08aa20>

mapping_dict = {"emp_length": {"10+ years": 10,"9 years": 9,"8 years": 8,"7 years": 7,"6 years": 6,"5 years": 5,"4 years": 4,"3 years": 3,"2 years": 2,"1 year": 1,"< 1 year": 0,"n/a": 0},"grade":{"A": 1,"B": 2,"C": 3,"D": 4,"E": 5,"F": 6,"G": 7}
}
data = data.replace(mapping_dict) #变量映射

数值类型缺失值

data.select_dtypes(include=[np.number]).isnull().sum().sort_values(ascending=False)

il_util                  883
mths_since_recent_inq    655
mo_sin_old_il_acct       203
mths_since_rcnt_il       203
bc_util                  109...
total_cu_tl                0
inq_fi                     0
total_rev_hi_lim           0
total_bc_limit             0
id                         0
Length: 80, dtype: int64

numColumns = data.select_dtypes(include=[np.number]).columns
msno.matrix(data[numColumns]) #缺失值可视化

<matplotlib.axes._subplots.AxesSubplot at 0x2caecfe1160>

data.select_dtypes(include=[np.number])

	id	loan_amnt	funded_amnt	funded_amnt_inv	int_rate	installment	grade	loan_status	dti	fico_range_low	...	num_tl_90g_dpd_24m	num_tl_op_past_12m	pct_tl_nvr_dlq	percent_bc_gt_75	pub_rec_bankruptcies	tax_liens	tot_hi_cred_lim	total_bal_ex_mort	total_bc_limit	total_il_high_credit_limit
18821	163425898	4500	4500	4500	0.1612	158.48	3	1	16.13	705	...	0	2	100.0	28.6	0	0	44700	10872	32800	0.0
61234	161908366	20000	20000	20000	0.2305	564.39	4	0	34.14	735	...	0	0	100.0	33.3	0	0	54349	19572	10400	22349.0
119781	159901427	10000	10000	10000	0.1862	257.32	4	0	27.84	680	...	0	3	100.0	0.0	0	0	69077	48184	9600	49477.0
49201	162292591	21000	21000	21000	0.1430	491.91	3	1	21.82	740	...	0	0	100.0	0.0	0	0	109894	66662	33800	67194.0
53727	162154208	40000	40000	40000	0.0819	814.70	1	0	27.52	700	...	0	0	100.0	50.0	0	0	207370	160985	98000	61725.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
86547	160719957	30000	30000	30000	0.0819	611.03	1	0	5.68	740	...	0	2	100.0	40.0	0	0	361548	46148	94500	0.0
69734	161401437	16000	16000	16000	0.1430	549.18	3	1	13.73	660	...	0	0	90.9	66.7	0	0	21300	15022	7800	6000.0
30947	162968064	1600	1600	1600	0.1102	52.40	2	0	17.32	715	...	0	1	100.0	50.0	0	0	63659	41808	27200	30259.0
29039	163064608	10000	10000	10000	0.1240	334.06	2	0	22.91	680	...	0	2	66.7	0.0	0	0	230024	36479	2900	60846.0
92872	160838177	23000	23000	23000	0.1774	580.81	3	1	0.00	800	...	0	0	100.0	0.0	0	0	85255	0	600	0.0

5973 rows × 80 columns

data.isnull().sum().sum()
mean_cols=data.mean()
data= data.fillna(mean_cols)

目标变量

y=data['loan_status']
x=data.drop(['loan_status'],axis=1)
#使用pandas库将类别变量编码
x =pd.get_dummies(x)

n_sample = y.shape[0]
n_pos_sample = y[y == 0].shape[0]
n_neg_sample = y[y == 1].shape[0]
print('样本个数：{}; 正样本占{:.2%}; 负样本占{:.2%}'.format(n_sample,n_pos_sample / n_sample,n_neg_sample / n_sample))
print('特征维数：', x.shape[1])

样本个数：5973; 正样本占66.97%; 负样本占33.03%
特征维数： 7167

特征工程

#数据进行分割（训练数据和测试数据）
from sklearn.model_selection  import train_test_split#测试集和训练集
x_train1, x_test1, y_train1, y_test1 = train_test_split(x, y, train_size=0.8, random_state=14)

x_train, x_test, y_train, y_test = x_train1, x_test1, y_train1, y_test1
print ("训练数据集样本数目：%d, 测试数据集样本数目：%d" % (x_train.shape[0], x_test.shape[0]))
y_train = y_train.astype(np.int)
y_test = y_test.astype(np.int)

训练数据集样本数目：4778, 测试数据集样本数目：1195

#参数优化
from sklearn.pipeline import Pipeline #管道
from sklearn.model_selection import GridSearchCV #网格搜索交叉验证，用于选择最优的参数
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
pipes =Pipeline([('mms', MinMaxScaler()), ## 归一化操作('pca', PCA()), ## 降纬('RandomForestClassifier', RandomForestClassifier(criterion='gini'))])
# 参数
#
# estimators = [1,50,100,500]
# depth = [1,2,3,7,15]
parameters = [{"pca__n_components": [1,2,3,4],"RandomForestClassifier__n_estimators":[1,50,100,500],"RandomForestClassifier__max_depth":[1,2,3,7,15]}
]
#获取数据
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
gscv = GridSearchCV(pipes, param_grid=parameters)
gscv.fit(x_train2, y_train2)
print ("score值:",gscv.best_score_,"最优参数列表:", gscv.best_params_)

score值: 0.6720405704396591 最优参数列表: {'RandomForestClassifier__max_depth': 7, 'RandomForestClassifier__n_estimators': 500, 'pca__n_components': 4}

#标准化
ss = MinMaxScaler()#分类模型，经常使用的是minmaxscaler归一化，回归模型经常用standardscaler
x_train = ss.fit_transform(x_train, y_train)
x_test = ss.transform(x_test)
x_train.shape

(4778, 7167)

#降维
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
x_train.shape
print(pca.explained_variance_ratio_)

[0.08187674 0.05705152 0.05380546 0.04683824]

#随机森林模型
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=2000, criterion='gini', max_depth=7, random_state=0)
forest.fit(x_train, y_train)#max_depth一般不宜设置过大，把每个模型作为一个弱分类器

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,criterion='gini', max_depth=7, max_features='auto',max_leaf_nodes=None, max_samples=None,min_impurity_decrease=0.0, min_impurity_split=None,min_samples_leaf=1, min_samples_split=2,min_weight_fraction_leaf=0.0, n_estimators=2000,n_jobs=None, oob_score=False, random_state=0, verbose=0,warm_start=False)

#模型效果评估
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
score = forest.score(x_test, y_test)
print ("准确率:%.2f%%" % (score * 100))
#模型预测
y_score = forest.predict(x_test)# prodict_proba输出概率

准确率:66.78%

# Compute ROC curve and ROC area for each class
import matplotlib.pyplot as plt
fpr,tpr,threshold = roc_curve(y_test, y_score) ###计算真正率和假正率
roc_auc = auc(fpr,tpr) ###计算auc的值
print('auc:%.2f'%(roc_auc))

auc:0.51

plt.figure()
lw = 2
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, color='darkorange',lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ###假正率为横坐标，真正率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

<Figure size 432x288 with 0 Axes>

决策树

#参数优化
from sklearn.tree import DecisionTreeClassifier
pipe = Pipeline([('mms', MinMaxScaler()),('pca', PCA()),('decision', DecisionTreeClassifier(random_state=0))])# 参数
parameters = {"pca__n_components": [0.5,0.99],#设置为浮点数代表主成分方差所占最小比例的阈值"decision__criterion": ["gini", "entropy"],"decision__max_depth": [1,2,3,4,5,6,7,8,9,10]
}
#数据
x_train2, x_test2, y_train2, y_test2 = x_train1, x_test1, y_train1, y_test1
#模型构建：通过网格交叉验证，寻找最优参数列表， param_grid可选参数列表，cv：进行几折交叉验证
gscv = GridSearchCV(pipe, param_grid=parameters,cv=3)
#模型训练
gscv.fit(x_train2, y_train2)
#算法的最优解
print("最优参数列表:", gscv.best_params_)
print("score值：",gscv.best_score_)

最优参数列表: {'decision__criterion': 'gini', 'decision__max_depth': 4, 'pca__n_components': 0.99}
score值： 0.6917121178186392

#降维
from sklearn.decomposition import PCA
pca = PCA(n_components= 0.99)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
x_train.shape
print(pca.explained_variance_ratio_)

[0.34176263 0.23813938 0.22458996 0.19550803]

tree = DecisionTreeClassifier(criterion='gini', max_depth=4)
tree.fit(x_train, y_train) # fit模型训练
# 模型相关的指标输出
# print("训练集上的准确率:%.3f" % tree.score(x_train, y_train))
y_hat = tree.predict(x_test) # 获取预测值
print("准确率:%.3f" % (np.mean(y_hat == y_test)))

准确率:0.671

# Compute ROC curve and ROC area for each class
import matplotlib.pyplot as plt
fpr,tpr,threshold = roc_curve(y_test, y_hat) ###计算真正率和假正率
roc_auc = auc(fpr,tpr) ###计算auc的值
print('auc:%.2f'%(roc_auc))

auc:0.51

plt.figure()
lw = 2
plt.figure(figsize=(10,10))
plt.plot(fpr, tpr, color='darkorange',lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ###假正率为横坐标，真正率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

<Figure size 432x288 with 0 Axes>