基于多机器学习模型的破产预测

在第一种公开数据集上的实现:

# -*- coding: utf-8 -*-
"""
Created on Mon Sep 14 13:17:40 2020
@author: Long Yu
"""# In[1]: Import several important libs.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn import metrics
from sklearn.metrics import confusion_matrix
get_ipython().magic('matplotlib inline')# In[2]: Function definition used for data process and model training.'''Function of splitting the data to features and the labels'''
def preprocessdata(raw_data):
    """Split a bankruptcy data set into its label vector and feature table.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Table holding the feature columns plus the target column ``class``.

    Returns
    -------
    labels_bankruptcy_flag : numpy.ndarray
        Values of the ``class`` column converted to ``float``.
    bankruptcy_factors : pandas.DataFrame
        New frame containing every column of ``raw_data`` except ``class``.
        ``raw_data`` itself is left unmodified.

    Raises
    ------
    KeyError
        If ``raw_data`` has no ``class`` column.
    """
    labels_bankruptcy_flag = np.array(raw_data['class'], dtype=float)
    # DataFrame.drop returns a new object, so no explicit copy is needed to
    # keep the caller's frame intact.
    bankruptcy_factors = raw_data.drop(columns=['class'])
    return labels_bankruptcy_flag, bankruptcy_factors


# Function of calculating performance indexes
def performance_indexes(true_labels, predicted_labels, predicted_proba=None):
    """Print classification metrics and plot the confusion matrix.

    Prints, in order, the accuracy, the ROC AUC (only when class
    probabilities are supplied), the confusion matrix and the
    classification report, then shows the confusion matrix as a heatmap.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth class labels.
    predicted_labels : array-like
        Labels predicted by the model.
    predicted_proba : array, optional
        Two-dimensional class probabilities; column 1 is used as the score
        for the ROC AUC. ``None`` or an empty sequence skips the ROC AUC.

    Returns
    -------
    The confusion matrix returned by ``sklearn.metrics.confusion_matrix``.
    """
    print(metrics.accuracy_score(true_labels, predicted_labels))

    # ``None`` replaces the former mutable default ``[]``; an explicitly
    # passed empty sequence is still accepted and still skips the AUC.
    if predicted_proba is not None and len(predicted_proba):
        print(metrics.roc_auc_score(true_labels, predicted_proba[:, 1]))

    # Compute the matrix once and reuse it for printing, plotting and return.
    cal_confusion_mat = confusion_matrix(true_labels, predicted_labels)
    print(cal_confusion_mat)
    print(metrics.classification_report(true_labels, predicted_labels))

    plt.figure(figsize=(10, 6))
    # The two tick labels assume a binary problem (0 = non-bankrupt,
    # 1 = bankrupt) -- TODO confirm against the data set's label encoding.
    sns.heatmap(
        cal_confusion_mat,
        xticklabels=['Non Bankrupt', 'Bankrupt'],
        yticklabels=['Non Bankrupt', 'Bankrupt'],
    )
    plt.show()
    return cal_confusion_mat


# Function of training bankruptcy model
def train_bankruptcy_model(training_data, select_model):
    """Fit the selected classifier on the training data and report its fit.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training table; ``preprocessdata`` splits it into labels and features.
    select_model : str
        One of ``'LR'`` (logistic regression), ``'Dtree'`` (decision tree),
        ``'MLP'`` (multi-layer perceptron) or ``'SVM'`` (support vector
        machine).

    Returns
    -------
    model
        The fitted scikit-learn estimator.
    vif : list
        One variance inflation factor per feature column, computed with
        ``statsmodels``' ``variance_inflation_factor``.

    Raises
    ------
    ValueError
        If ``select_model`` is not one of the supported names.
    """
    # 2_1. split the training data into the features and the labels
    train_label_bankruptcy_flag, training_bankruptcy_factors = preprocessdata(training_data)
    print(training_bankruptcy_factors.columns)

    # 2_2. build the selected machine learning model
    if select_model == 'LR':
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression()
    elif select_model == 'Dtree':
        from sklearn.tree import DecisionTreeClassifier
        model = DecisionTreeClassifier()
    elif select_model == 'MLP':
        from sklearn.neural_network import MLPClassifier
        model = MLPClassifier(hidden_layer_sizes=(12, 12, 12))
    elif select_model == 'SVM':
        from sklearn.svm import SVC
        # probability=True is required for predict_proba, which is called below.
        model = SVC(probability=True)
    else:
        # Previously an unknown name left ``model`` unbound and failed later
        # with an unrelated NameError at ``model.fit``.
        raise ValueError(
            "Unsupported select_model {!r}; expected 'LR', 'Dtree', 'MLP' or 'SVM'".format(select_model)
        )

    # 2_3. train the model and check the accuracy on the training set
    model = model.fit(training_bankruptcy_factors, train_label_bankruptcy_flag)
    acc = model.score(training_bankruptcy_factors, train_label_bankruptcy_flag)
    print('Evaluation of ',select_model,' model using the training data: ',acc)

    # 2_4. predict labels of the training data using the model
    predicted_train_labels = model.predict(training_bankruptcy_factors)

    # 2_5. class probabilities produced by the model
    proba_training = model.predict_proba(training_bankruptcy_factors)

    # 2_6. print score, confusion matrix and other performance indexes
    performance_indexes(train_label_bankruptcy_flag, predicted_train_labels, proba_training)

    # 2_7. calculate the VIF of every feature column
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    feature_values = training_bankruptcy_factors.values
    vif = [
        variance_inflation_factor(feature_values, column_index)
        for column_index in range(feature_values.shape[1])
    ]
    return model, vif


# Function of predicting with a trained bankruptcy model
def predict_bankruptcy_result(test_label_bankruptcy_flag, test_bankruptcy_factors, select_model, bankruptcy_model):
    """Evaluate a fitted model on the testing data and return its predictions.

    Prints the predicted labels, the test accuracy, the mean of the true
    labels, the predicted class probabilities, and the metrics produced by
    ``performance_indexes``.

    Parameters
    ----------
    test_label_bankruptcy_flag : numpy.ndarray
        True labels of the testing data (must provide ``.mean()``).
    test_bankruptcy_factors : pandas.DataFrame
        Feature table of the testing data.
    select_model : str
        Model name, used only in the printed messages.
    bankruptcy_model
        Fitted estimator providing ``predict``, ``score`` and
        ``predict_proba``.

    Returns
    -------
    predicted_test_labels : pandas.Series
        Predicted labels, indexed 0..n-1.
    proba_testing
        Output of ``bankruptcy_model.predict_proba`` on the test features.
    """
    # 2_1. predict labels of the testing data using the model
    predicted_test_labels = bankruptcy_model.predict(test_bankruptcy_factors)
    print(predicted_test_labels)
    acc = bankruptcy_model.score(test_bankruptcy_factors, test_label_bankruptcy_flag)
    print('Evaluation of ',select_model,' model on testing bankruptcy data: ',acc)
    print('Percentage of bankruptcy on testing bankruptcy data:',test_label_bankruptcy_flag.mean())

    # 2_2. class probabilities produced by the model
    proba_testing = bankruptcy_model.predict_proba(test_bankruptcy_factors)
    print(proba_testing)

    # 2_3. print score, confusion matrix and other performance indexes
    performance_indexes(test_label_bankruptcy_flag, predicted_test_labels, proba_testing)

    return pd.Series(predicted_test_labels), proba_testing


# In[3]: Classification main function with training and testing.
# Load the data and preprocess it.
from scipy.io import arff

select_data = "1year.arff"
All_bankruptcy_data, meta = arff.loadarff(select_data)
All_bankruptcy_data = pd.DataFrame(All_bankruptcy_data)
# The 'class' values arrive as bytes (hence bytes.decode); turn them into ints.
All_bankruptcy_data['class'] = All_bankruptcy_data['class'].apply(lambda row_x: int(bytes.decode(row_x)))
All_bankruptcy_data = All_bankruptcy_data.drop(columns=['Attr37', 'Attr21'])
# Missing values are replaced by 0.
All_bankruptcy_data = All_bankruptcy_data.fillna(0)

# 3_1. select the training data: a reproducible random half of the rows
training_bankruptcy_data = All_bankruptcy_data.sample(frac=0.5, random_state=0)

# 3_2. plot the bar graph reflecting the count of two labels -- bankruptcy or not
plt.figure(figsize=(10, 6))
sns.countplot(x='class', data=training_bankruptcy_data)
plt.show()

# 3_3. the testing data is every row that was not sampled for training
testing_bankruptcy_data = All_bankruptcy_data.loc[
    ~All_bankruptcy_data.index.isin(training_bankruptcy_data.index)
]
testing_bankruptcy_data.head()

# 3_4. plot the bar graph reflecting the count of two labels -- bankruptcy or not
plt.figure(figsize=(10, 6))
sns.countplot(x='class', data=testing_bankruptcy_data)
plt.show()

# 3_5. split the testing data into the features and the labels
test_label_bankruptcy_flag, test_bankruptcy_factors = preprocessdata(testing_bankruptcy_data)

model_name_all = ['LR', 'Dtree', 'MLP', 'SVM']
composite_predlabels = pd.DataFrame()
for select_model in model_name_all:
    print('------Using ',select_model,' model for training------')
    # 3_5. train the bankruptcy model
    bankruptcy_model, bankruptcy_VIF = train_bankruptcy_model(training_bankruptcy_data, select_model)
    # 3_6. evaluate the model on the testing data
    predicted_test_labels, proba_testing = predict_bankruptcy_result(
        test_label_bankruptcy_flag, test_bankruptcy_factors, select_model, bankruptcy_model
    )
    # 3_7. collect the predictions for the composite (voting) labels
    composite_predlabels[select_model] = predicted_test_labels

# Majority vote over LR, MLP and Dtree (the SVM predictions are not used).
# mode(axis=1) returns a DataFrame; assuming binary labels, three votes always
# give a unique mode, so the first column holds the voted label for every row.
composite_predicted_bankrupt = (
    composite_predlabels[['LR', 'MLP', 'Dtree']].mode(axis=1, numeric_only=True).iloc[:, 0]
)
print(metrics.accuracy_score(test_label_bankruptcy_flag, composite_predicted_bankrupt))

# 3_8. calculate score, confusion matrix and other performance indexes
final_test_confusion_mat = performance_indexes(test_label_bankruptcy_flag, composite_predicted_bankrupt)

在第二种数据集上的实现如下:

# -*- coding: utf-8 -*-
"""
Created on Mon Sep 14 13:17:40 2020
@author: Long Yu
"""# In[1]: Import several important libs.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrices
from sklearn import metrics
from sklearn.metrics import confusion_matrix
get_ipython().magic('matplotlib inline')# In[2]: Function definition used for data process and model training.'''Function of splitting the data to features and the labels'''
def preprocessdata(raw_data):
    """Split a bankruptcy data set into its label vector and feature table.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Table holding the feature columns plus the target column ``bstatus``
        and the identifier column ``ID``.

    Returns
    -------
    labels_bankruptcy_flag : numpy.ndarray
        Values of the ``bstatus`` column converted to ``float``.
    bankruptcy_factors : pandas.DataFrame
        New frame containing every column of ``raw_data`` except ``bstatus``
        and ``ID``. ``raw_data`` itself is left unmodified.

    Raises
    ------
    KeyError
        If ``raw_data`` lacks the ``bstatus`` or ``ID`` column.
    """
    labels_bankruptcy_flag = np.array(raw_data['bstatus'], dtype=float)
    # DataFrame.drop returns a new object, so no explicit copy is needed to
    # keep the caller's frame intact.
    bankruptcy_factors = raw_data.drop(columns=['bstatus', 'ID'])
    return labels_bankruptcy_flag, bankruptcy_factors


# Function of calculating performance indexes
def performance_indexes(true_labels, predicted_labels, predicted_proba=None):
    """Print classification metrics and plot the confusion matrix.

    Prints, in order, the accuracy, the ROC AUC (only when class
    probabilities are supplied), the confusion matrix and the
    classification report, then shows the confusion matrix as a heatmap.

    Parameters
    ----------
    true_labels : array-like
        Ground-truth class labels.
    predicted_labels : array-like
        Labels predicted by the model.
    predicted_proba : array, optional
        Two-dimensional class probabilities; column 1 is used as the score
        for the ROC AUC. ``None`` or an empty sequence skips the ROC AUC.

    Returns
    -------
    The confusion matrix returned by ``sklearn.metrics.confusion_matrix``.
    """
    print(metrics.accuracy_score(true_labels, predicted_labels))

    # ``None`` replaces the former mutable default ``[]``; an explicitly
    # passed empty sequence is still accepted and still skips the AUC.
    if predicted_proba is not None and len(predicted_proba):
        print(metrics.roc_auc_score(true_labels, predicted_proba[:, 1]))

    # Compute the matrix once and reuse it for printing, plotting and return.
    cal_confusion_mat = confusion_matrix(true_labels, predicted_labels)
    print(cal_confusion_mat)
    print(metrics.classification_report(true_labels, predicted_labels))

    plt.figure(figsize=(10, 6))
    # The two tick labels assume a binary problem (0 = non-bankrupt,
    # 1 = bankrupt) -- TODO confirm against the data set's label encoding.
    sns.heatmap(
        cal_confusion_mat,
        xticklabels=['Non Bankrupt', 'Bankrupt'],
        yticklabels=['Non Bankrupt', 'Bankrupt'],
    )
    plt.show()
    return cal_confusion_mat


# Function of training bankruptcy model
def train_bankruptcy_model(training_data, select_model):
    """Fit the selected classifier on the training data and report its fit.

    Parameters
    ----------
    training_data : pandas.DataFrame
        Training table; ``preprocessdata`` splits it into labels and features.
    select_model : str
        One of ``'LR'`` (logistic regression), ``'Dtree'`` (decision tree),
        ``'MLP'`` (multi-layer perceptron) or ``'SVM'`` (support vector
        machine).

    Returns
    -------
    model
        The fitted scikit-learn estimator.
    vif : list
        One variance inflation factor per feature column, computed with
        ``statsmodels``' ``variance_inflation_factor``.

    Raises
    ------
    ValueError
        If ``select_model`` is not one of the supported names.
    """
    # 2_1. split the training data into the features and the labels
    train_label_bankruptcy_flag, training_bankruptcy_factors = preprocessdata(training_data)
    print(training_bankruptcy_factors.columns)

    # 2_2. build the selected machine learning model
    if select_model == 'LR':
        from sklearn.linear_model import LogisticRegression
        model = LogisticRegression()
    elif select_model == 'Dtree':
        from sklearn.tree import DecisionTreeClassifier
        model = DecisionTreeClassifier()
    elif select_model == 'MLP':
        from sklearn.neural_network import MLPClassifier
        model = MLPClassifier(hidden_layer_sizes=(12, 12, 12))
    elif select_model == 'SVM':
        from sklearn.svm import SVC
        # probability=True is required for predict_proba, which is called below.
        model = SVC(probability=True)
    else:
        # Previously an unknown name left ``model`` unbound and failed later
        # with an unrelated NameError at ``model.fit``.
        raise ValueError(
            "Unsupported select_model {!r}; expected 'LR', 'Dtree', 'MLP' or 'SVM'".format(select_model)
        )

    # 2_3. train the model and check the accuracy on the training set
    model = model.fit(training_bankruptcy_factors, train_label_bankruptcy_flag)
    acc = model.score(training_bankruptcy_factors, train_label_bankruptcy_flag)
    print('Evaluation of ',select_model,' model using the training data: ',acc)

    # 2_4. predict labels of the training data using the model
    predicted_train_labels = model.predict(training_bankruptcy_factors)

    # 2_5. class probabilities produced by the model
    proba_training = model.predict_proba(training_bankruptcy_factors)

    # 2_6. print score, confusion matrix and other performance indexes
    performance_indexes(train_label_bankruptcy_flag, predicted_train_labels, proba_training)

    # 2_7. calculate the VIF of every feature column
    from statsmodels.stats.outliers_influence import variance_inflation_factor
    feature_values = training_bankruptcy_factors.values
    vif = [
        variance_inflation_factor(feature_values, column_index)
        for column_index in range(feature_values.shape[1])
    ]
    return model, vif


# Function of predicting with a trained bankruptcy model
def predict_bankruptcy_result(test_label_bankruptcy_flag, test_bankruptcy_factors, select_model, bankruptcy_model):
    """Evaluate a fitted model on the testing data and return its predictions.

    Prints the predicted labels, the test accuracy, the mean of the true
    labels, the predicted class probabilities, and the metrics produced by
    ``performance_indexes``.

    Parameters
    ----------
    test_label_bankruptcy_flag : numpy.ndarray
        True labels of the testing data (must provide ``.mean()``).
    test_bankruptcy_factors : pandas.DataFrame
        Feature table of the testing data.
    select_model : str
        Model name, used only in the printed messages.
    bankruptcy_model
        Fitted estimator providing ``predict``, ``score`` and
        ``predict_proba``.

    Returns
    -------
    predicted_test_labels : pandas.Series
        Predicted labels, indexed 0..n-1.
    proba_testing
        Output of ``bankruptcy_model.predict_proba`` on the test features.
    """
    # 2_1. predict labels of the testing data using the model
    predicted_test_labels = bankruptcy_model.predict(test_bankruptcy_factors)
    print(predicted_test_labels)
    acc = bankruptcy_model.score(test_bankruptcy_factors, test_label_bankruptcy_flag)
    print('Evaluation of ',select_model,' model on testing bankruptcy data: ',acc)
    print('Percentage of bankruptcy on testing bankruptcy data:',test_label_bankruptcy_flag.mean())

    # 2_2. class probabilities produced by the model
    proba_testing = bankruptcy_model.predict_proba(test_bankruptcy_factors)
    print(proba_testing)

    # 2_3. print score, confusion matrix and other performance indexes
    performance_indexes(test_label_bankruptcy_flag, predicted_test_labels, proba_testing)

    return pd.Series(predicted_test_labels), proba_testing


# In[3]: Classification main function with training and testing.
# 3_1. select the training data
# Training subset to use; the other subsets are kept as commented alternatives.
#select_train_data="./train_subset_one.csv"
select_train_data = "./train_subset_two.csv"
#select_train_data="./train_subset_three.csv"
training_bankruptcy_data = pd.read_csv(select_train_data)
training_bankruptcy_data.head()

# 3_2. plot the bar graph reflecting the count of two labels -- bankruptcy or not
plt.figure(figsize=(10, 6))
sns.countplot(x='bstatus', data=training_bankruptcy_data)
plt.show()

# 3_3. load the testing data
testing_bankruptcy_data = pd.read_csv("./test_data_new.csv")
testing_bankruptcy_data.head()

# 3_4. plot the bar graph reflecting the count of two labels -- bankruptcy or not
plt.figure(figsize=(10, 6))
sns.countplot(x='bstatus', data=testing_bankruptcy_data)
plt.show()

# 3_5. split the testing data into the features and the labels
test_label_bankruptcy_flag, test_bankruptcy_factors = preprocessdata(testing_bankruptcy_data)

model_name_all = ['LR', 'Dtree', 'MLP', 'SVM']
composite_predlabels = pd.DataFrame()
for select_model in model_name_all:
    print('------Using ',select_model,' model for training------')
    # 3_5. train the bankruptcy model
    bankruptcy_model, bankruptcy_VIF = train_bankruptcy_model(training_bankruptcy_data, select_model)
    # 3_6. evaluate the model on the testing data
    predicted_test_labels, proba_testing = predict_bankruptcy_result(
        test_label_bankruptcy_flag, test_bankruptcy_factors, select_model, bankruptcy_model
    )
    # 3_7. collect the predictions for the composite (voting) labels
    composite_predlabels[select_model] = predicted_test_labels

# Majority vote over LR, MLP and Dtree (the SVM predictions are not used).
# mode(axis=1) returns a DataFrame; assuming binary labels, three votes always
# give a unique mode, so the first column holds the voted label for every row.
composite_predicted_bankrupt = (
    composite_predlabels[['LR', 'MLP', 'Dtree']].mode(axis=1, numeric_only=True).iloc[:, 0]
)
print(metrics.accuracy_score(test_label_bankruptcy_flag, composite_predicted_bankrupt))

# 3_8. calculate score, confusion matrix and other performance indexes
final_test_confusion_mat = performance_indexes(test_label_bankruptcy_flag, composite_predicted_bankrupt)

以上代码还有以下工作有待完成:数据不平衡处理(过采样、欠采样方法)、缺失数据填补、多模型融合。

基于多机器学习模型的破产预测相关推荐

  1. 基于Keras机器学习库的分类预测

    在前面的博文中,我们分享了<基于scikit-learn机器学习库的分类预测>,本文将分享Keras机器学习库的分类预测. 一旦你在Keras中选择好机器学习模型,就可以用它来预测新的数据 ...

  2. 基于张量机器学习模型_什么是基于模型的机器学习?

    基于张量机器学习模型 About Tom: Tom Diethe is a research fellow on the SPHERE project at the University of Bri ...

  3. 使用pmml跨平台部署机器学习模型Demo——房价预测

      基于房价数据,在python中训练得到一个线性回归的模型,在JavaWeb中加载模型完成房价预测的功能. 一. 训练.保存模型 工具:PyCharm-2017.Python-39.sklearn2 ...

  4. 基于scikit-learn机器学习库的分类预测

    一旦你在scikit-learn中选择好机器学习模型,就可以用它来预测新的数据实例.初学者经常会有这样的疑问: 如何在scikit-learn中用我自己的模型进行预测? 在本教程中,你将会发现如何在P ...

  5. 基于时间序列AR模型的PHM预测

    基于时间序列AR模型的PHM预测 由于时间序列分析方法是一个小样本理论,应用起来方便简单,符合实际工程中样本数量较小的情况的需求. 在工程领域,自回归(AR)模型比滑动平均(MA)模型和自回归滑动平均 ...

  6. 机器学习(一):基于Logistic回归模型的分类预测(算法实践)——阿里云天池

    文章目录 前言 一.逻辑回归的介绍和应用 1.1 逻辑回归的应用 二.逻辑回归案例 2.1.引入库 2.2读入数据 2.3.调用函数拟合数据 2.4.设置边界 2.5.预测数据 2.6.预测数据值 总 ...

  7. 机器学习模型评估与预测

    模型评估与预测 1.1经验误差与过拟合 1.2 评估方法 1.2.1留出法(hold-out) 1.2.2交叉验证法 1.2.3 自助法 1.3性能度量 1.3.1 查准率,查全率,准确率 1.3.2 ...

  8. 基于大规模机器学习模型的推荐系统

    推荐系统的本质是什么? 比如说我们看到手机淘宝首页,往下一拉,就能看到各种各样推荐的商品:比如说百度,它会给我们推荐广告,在某种程度上他的工作方式也很像推荐系统:再比如说今日头条,今日头条从数十万的新 ...

  9. 基于神经网络rnn模型的心脏病预测案例详细教程

    目录 预备准备: 数据准备: 数据预处理 : 构建RNN模型: 编译模型

最新文章

  1. webBrowse无法正常显示Excel
  2. Wi-Fi信号满格网速就一定快吗?
  3. ACM成长之路(干货) 我爱ACM,与君共勉
  4. 西安工业学院计算机系王翊,西安文理学院艺术学院
  5. chrome linux添加图标,分享|在 Linux 下体验谷歌 Material风格的GTK和图标主题Paper
  6. JMP M16:64
  7. STL 算法罗列 (转)
  8. 反激式开关电源电路测试记录(二)
  9. 机器学习算法——GBDT
  10. 计算机视觉之图像分割——Snake模型(1译文)
  11. IEEE754-2008 标准详解(五):异常
  12. 我爬取了淘宝零食库,画了一个全国零食地图
  13. Android手表商场项目总结
  14. 优麒麟这款工具,助你提高60%的工作效率
  15. 微信公众号数据2019_2019年微信公众号文章数据报告
  16. qlv文件怎么打开 怎么把qlv文件转换成MP4的文件呢
  17. ASO优化优缺点各是什么?带你学会常见的优化手段
  18. 3DMAX模型转换为gltf格式总结与问题汇总
  19. 小米红米全系列官方原厂预装系统
  20. 编码格式之间进行文本内码转换的库

热门文章

  1. LED、CCFL、TFT屏幕三者的区别
  2. 股票大数据分析软件V2.7
  3. 小米5 android 8.0原生,终于开吃奥利奥!小米5 Android 8.0 内测体验计划开启
  4. Abaqus基础问题解答
  5. sql查询当天 当月 当年
  6. IPFS(中文白皮书)
  7. 计算机高考英语,高考英语优秀作文 Computer(计算机)
  8. roll() java_java.util.Calendar.roll(int field,int amount)方法实例
  9. 又一家著名游戏公司在西雅图开分店了
  10. (附源码)spring boot大学生综合素质测评系统 毕业设计 162308