ML之RF:kaggle比赛之利用泰坦尼克号数据集建立RF模型对每个人进行获救是否预测

目录

输出结果

实现代码


输出结果

后期更新……

实现代码

# The prediction model used here is a random forest (RF)
import numpy as np
import pandas as pd
from pandas import  DataFrame
from patsy import dmatrices
import string
from operator import itemgetter
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split,StratifiedShuffleSplit,StratifiedKFold
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.externals import joblib

# Read configuration parameters.
# Each setting lives on its own line; when fused onto the comment line above,
# `train_file` was never actually assigned.
train_file = "train.csv"    # CSV read by the training script
MODEL_PATH = "./"           # directory prefix for the pickled model
test_file = "test.csv"
SUBMISSION_PATH = "./"
seed = 0                    # random_state passed to the split / classifier

print(train_file, seed)

# Output the scores
def report(grid_scores, n_top=3):
    """Print a summary of the best-scoring grid-search candidates.

    Args:
        grid_scores: Iterable of score records. Each record is indexable
            (it is ranked by the item at index 1) and exposes the attributes
            ``mean_validation_score``, ``cv_validation_scores`` and
            ``parameters``.
        n_top: Maximum number of records to print, best first.

    Returns:
        None. For every selected record the rank, the mean validation score
        with the standard deviation of its per-fold scores, and the
        parameters are written to stdout, followed by an empty line.
    """
    # Rank by the item at index 1, highest first, and keep the n_top best.
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for rank, score in enumerate(top_scores, start=1):
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            score.mean_validation_score,
            np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")


# Clean and process the data
def substrings_in_string(big_string, substrings):
    """Return the first candidate from ``substrings`` contained in ``big_string``.

    Args:
        big_string: Text to search.
        substrings: Candidates, tried in the given order; the first one found
            anywhere in ``big_string`` wins.

    Returns:
        The matching candidate, or ``np.nan`` when none matches (in that case
        ``big_string`` is printed first so unmatched inputs are visible).
    """
    for substring in substrings:
        # Use the str method rather than the module-level string.find()
        # helper, which is not available on Python 3.
        if big_string.find(substring) != -1:
            return substring
    # Nothing matched: echo the unmatched value, then signal "missing".
    print(big_string)
    return np.nan


# Shared label encoder, re-fitted for each categorical column it encodes.
le = preprocessing.LabelEncoder()
enc = preprocessing.OneHotEncoder()


def clean_and_munge_data(df):
    """Clean the passenger table and derive the features used for modelling.

    The frame passed in is modified in place (columns are added and
    overwritten); the returned frame is a copy of it without the
    ``PassengerId``, ``Name``, ``Age`` and ``Cabin`` columns.

    Columns read: ``Fare``, ``Name``, ``Sex``, ``SibSp``, ``Parch``,
    ``Pclass``, ``Age``, ``Embarked``, ``Cabin``, ``Ticket``, ``PassengerId``.
    Columns added: ``Title``, ``Family_Size``, ``Family``, ``Gender``,
    ``AgeFill``, ``AgeCat``, ``Fare_Per_Person``, ``AgeClass``,
    ``ClassFare``, ``HighLow``.

    Relies on the module-level ``le`` label encoder (re-fitted per column)
    and on ``substrings_in_string``.

    Args:
        df: pandas DataFrame holding the columns listed above.

    Returns:
        The cleaned DataFrame with categorical columns label-encoded as floats.
    """
    # Handle missing values: a fare of 0 is treated as missing.
    df.Fare = df.Fare.map(lambda x: np.nan if x == 0 else x)

    # Derive the Title column from the passenger name.
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                  'Dr', 'Ms', 'Mlle', 'Col', 'Capt', 'Mme', 'Countess',
                  'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # Collapse the special titles into Mr, Mrs, Miss and Master.
    def replace_titles(x):
        title = x['Title']
        # Sex holds lowercase 'male'/'female' (see the Gender mapping below),
        # so the comparison must be lowercase for the male branches to fire.
        is_male = x['Sex'] == 'male'
        if title in ['Mr', 'Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Master']:
            return 'Master'
        elif title in ['Countess', 'Mme', 'Mrs']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms', 'Miss']:
            return 'Miss'
        elif title == 'Dr':
            return 'Mr' if is_male else 'Mrs'
        elif title == '' or pd.isnull(title):
            # substrings_in_string reports "no title found" as NaN, so treat
            # NaN like the empty string and fall back on the sex.
            return 'Master' if is_male else 'Miss'
        else:
            return title

    df['Title'] = df.apply(replace_titles, axis=1)

    # Family-size features.
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Family'] = df['SibSp'] * df['Parch']

    # Fill missing fares with the median known fare of the same class.
    for pclass in (1, 2, 3):
        in_class = df['Pclass'] == pclass
        class_median = np.median(df[in_class]['Fare'].dropna())
        df.loc[df.Fare.isnull() & in_class, 'Fare'] = class_median

    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill missing ages with the mean known age of passengers sharing a title.
    df['AgeFill'] = df['Age']
    for title in ('Miss', 'Mrs', 'Mr', 'Master'):
        has_title = df['Title'] == title
        mean_age = np.average(df[has_title]['Age'].dropna())
        df.loc[df.Age.isnull() & has_title, 'AgeFill'] = mean_age

    # Bucket the filled age. Start from an object column so the string labels
    # can be assigned without relying on implicit dtype upcasting.
    df['AgeCat'] = df['AgeFill'].astype(object)
    df.loc[df.AgeFill <= 10, 'AgeCat'] = 'child'
    df.loc[df.AgeFill > 60, 'AgeCat'] = 'aged'
    df.loc[(df.AgeFill > 10) & (df.AgeFill <= 30), 'AgeCat'] = 'adult'
    df.loc[(df.AgeFill > 30) & (df.AgeFill <= 60), 'AgeCat'] = 'senior'

    df.Embarked = df.Embarked.fillna('S')

    # Cabin flag: 0.5 when the cabin is unknown, 1.5 when it is known.
    # The mask is taken once, before any assignment; re-evaluating isnull()
    # after filling the missing rows would mark every row as known.
    cabin_missing = df.Cabin.isnull()
    df.loc[cabin_missing, 'Cabin'] = 0.5
    df.loc[~cabin_missing, 'Cabin'] = 1.5

    df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)

    # Age times class
    df['AgeClass'] = df['AgeFill'] * df['Pclass']
    df['ClassFare'] = df['Pclass'] * df['Fare_Per_Person']

    # Low/High fare-per-person flag (object column, same reason as AgeCat).
    df['HighLow'] = df['Pclass'].astype(object)
    df.loc[df.Fare_Per_Person < 8, 'HighLow'] = 'Low'
    df.loc[df.Fare_Per_Person >= 8, 'HighLow'] = 'High'

    # Label-encode the categorical columns as floats. The builtin float is
    # used instead of the np.float alias, which newer NumPy no longer has.
    for column in ('Sex', 'Ticket', 'Title', 'HighLow', 'AgeCat', 'Embarked'):
        le.fit(df[column])
        df[column] = le.transform(df[column]).astype(float)

    # Remove PassengerId, Name, Age and Cabin.
    df = df.drop(['PassengerId', 'Name', 'Age', 'Cabin'], axis=1)
    return df


# Load the data
traindf = pd.read_csv(train_file)

# Clean the data
df = clean_and_munge_data(traindf)

######################################## formula ################################
# The formula must be a real assignment: dmatrices() below reads formula_ml.
formula_ml = 'Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size'
y_train, x_train = dmatrices(formula_ml, data=df, return_type='dataframe')
y_train = np.asarray(y_train).ravel()
print(y_train.shape, x_train.shape)

# Split into a training part and a held-out part
X_train, X_test, Y_train, Y_test = train_test_split(
    x_train, y_train, test_size=0.2, random_state=seed)

# Initialise the classifier.
# min_samples_split=2 grows the same trees as 1 (a node holding a single
# sample cannot be split) and is accepted by scikit-learn releases that
# reject values below 2.
clf = RandomForestClassifier(
    n_estimators=500, criterion='entropy', max_depth=5, min_samples_split=2,
    min_samples_leaf=1, max_features='auto', bootstrap=False, oob_score=False,
    n_jobs=1, random_state=seed, verbose=0)

# Grid search for the best parameters. The grid is empty, so only the
# classifier configuration above is evaluated.
param_grid = dict()

# Build the classification pipeline
pipeline = Pipeline([('clf', clf)])

# The `indices` and `n_iterations` arguments were only ever passed as None
# (their default), so they are omitted.
grid_search = GridSearchCV(
    pipeline, param_grid=param_grid, verbose=3, scoring='accuracy',
    cv=StratifiedShuffleSplit(Y_train, n_iter=10, test_size=0.2,
                              train_size=None, random_state=seed),
).fit(X_train, Y_train)

# Score the results
print("Best score: %0.3f" % grid_search.best_score_)
print(grid_search.best_estimator_)
report(grid_search.grid_scores_)
print('-----grid search end------------')

print('on all train set')
scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='accuracy')
print(scores.mean(), scores)

# NOTE(review): this cross-validates (re-fits) the estimator on the held-out
# part rather than scoring the already-fitted model on it - confirm intent.
print('on test set')
scores = cross_val_score(grid_search.best_estimator_, X_test, Y_test, cv=3, scoring='accuracy')
print(scores.mean(), scores)

# Score the results
print(classification_report(Y_train, grid_search.best_estimator_.predict(X_train)))
print('test data')
print(classification_report(Y_test, grid_search.best_estimator_.predict(X_test)))

# Persist the best estimator
model_file = MODEL_PATH + 'model-rf.pkl'
joblib.dump(grid_search.best_estimator_, model_file)

ML之RF:kaggle比赛之利用泰坦尼克号数据集建立RF模型对每个人进行获救是否预测相关推荐

  1. EL之Bagging:kaggle比赛之利用泰坦尼克号数据集建立Bagging模型对每个人进行获救是否预测

    EL之Bagging:kaggle比赛之利用泰坦尼克号数据集建立Bagging模型对每个人进行获救是否预测 目录 输出结果 设计思路 核心代码 输出结果 设计思路 核心代码 bagging_clf = ...

  2. ML之LoR:kaggle比赛之利用泰坦尼克号数据集建立LoR模型对每个人进行获救是否预测

    比赛要求:根据训练集数据和测试集数据生成自己的预测模型,按照预测模型来预测出892到1309条数据是否获救,按照比赛规定的格式生成csv文件,并上传到kaggle上,然后会反馈预测的准确率. 导读: ...

  3. TF之pix2pix:基于TF利用Facades数据集训练pix2pix模型、测试并进行生成过程全记录

    TF之pix2pix:基于TF利用Facades数据集训练pix2pix模型.测试并进行生成过程全记录 目录 TB监控 1.SCALARS 2.IMAGES 3.GRAPHS 4.DISTRIBUTI ...

  4. 用matlab建立晶体模型,利用materials studio建立晶体模型的步骤 | 附下载

    1.启动materials studio时会提示:create a new project or open an existing project 在这里选择create a new project, ...

  5. ML之LoRBaggingRF:依次利用LoR、Bagging、RF算法对泰坦尼克号数据集 (Kaggle经典案例)获救人员进行二分类预测(最全)

    ML之LoR&Bagging&RF:依次利用LoR.Bagging.RF算法对泰坦尼克号数据集 (Kaggle经典案例)获救人员进行二分类预测 目录 输出结果 设计思路 核心代码 输出 ...

  6. ML之LoRBaggingRF:依次利用LoR、Bagging、RF算法对titanic(泰坦尼克号)数据集 (Kaggle经典案例)获救人员进行二分类预测(最全)

    ML之LoR&Bagging&RF:依次利用LoR.Bagging.RF算法对titanic(泰坦尼克号)数据集 (Kaggle经典案例)获救人员进行二分类预测 目录 输出结果 设计思 ...

  7. Kaggle比赛心得

    正文共5453个字,5张图,预计阅读时间14分钟. 最近参加了两场Kaggle比赛,收获颇多,一直想写篇文章总结一下.接触Kaggle到现在不到一年,比赛成绩一个银牌(5%)一个铜牌(9%),勉强算入 ...

  8. ML之LoRBaggingRF:依次利用Bagging、RF算法对泰坦尼克号数据集 (Kaggle经典案例)获救人员进行二分类预测——模型融合

    ML之LoR&Bagging&RF:依次利用Bagging.RF算法对泰坦尼克号数据集 (Kaggle经典案例)获救人员进行二分类预测--模型融合 目录 输出结果 设计思路 核心代码 ...

  9. ML之LoRBaggingRF:依次利用LoR、Bagging、RF算法对泰坦尼克号数据集 (Kaggle经典案例)获救人员进行二分类预测——优化baseline模型

    ML之LoR&Bagging&RF:依次利用LoR.Bagging.RF算法对泰坦尼克号数据集 (Kaggle经典案例)获救人员进行二分类预测--优化baseline模型 目录 模型优 ...

最新文章

  1. 给大家分享微信小说域名防封最新的解决方案
  2. Window平台实时流媒体
  3. 【转】The test form is only available for requests from the local machine 解决方法
  4. MySQL主从复制故障解决
  5. Spark2内存调优总结 - 内存划分 与 内存计算 与 调参方式
  6. 【原型设计】第二节:Axure RP9制作自己的元件库的操作教程
  7. SQL数据层面操作(DML)
  8. html5游戏开发-零基础开发《圣诞老人送礼物》小游戏
  9. iOS UIView异步绘制实现圆角的方案
  10. vb不能插入png图片_VB6.0载入PNG格式图片
  11. 仅当使用了列列表并且 IDENTITY_INSERT 为 ON 时,才能为表'XXX'中的标识列指定显式值。...
  12. 《Java语言程序设计》(基础篇原书第10版)第九章复习题答案
  13. navicat中如何查看mysql日志_如何查看 Navicat Premium 日志文件
  14. 澳洲国立大学的计算机专业,澳洲国立大学计算机专业前景和申请详解
  15. 后端开发面试自我介绍_前端开发面试自我介绍
  16. 尚学堂怎么样?给你讲讲我的亲身经历
  17. 安装程序 Repack 攻略 之 CAB 篇
  18. 周报 | 吉吉拍助力消费者转变
  19. vue里面使用echarts实现根据浏览器屏幕大小自适应
  20. 淘最热点AutoJs脚本分享

热门文章

  1. 一个数里有那些约数用c++怎么做_如何从一堆数里找出哪几个数相加等于你要的值?...
  2. ZYNQ 调试遇到的问题
  3. dos下实现延迟功能
  4. Linux 最常用命令:简单易学,但能解决 95% 以上的问题
  5. 为什么中国程序员水平一直上不了层次?无非是这些原因!
  6. 又到校招季,来说说面试和实习
  7. 微软研究员:fork() 已落后,需要淘汰
  8. 又是一个程序员粗心的代码引起频繁FullGC的案例
  9. Nomad技术手册:调度(Scheduling)
  10. 搭建rabbitmq的docker集群