ta

 

白酒、泰坦尼克号例子

# Wine classification example: compare LogisticRegression vs. XGBoost on the
# UCI wine dataset (CSV; column 0 is the class label 1/2/3, rest are features).
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split   # cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

if __name__ == "__main__":
    data = np.loadtxt('wine.data', dtype=float, delimiter=',')
    # Split after column 0: y = data[:, :1] (label), x = data[:, 1:] (features).
    y, x = np.split(data, (1,), axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.5)

    # Logistic regression baseline.
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train.ravel())
    y_hat = lr.predict(x_test)
    print('Logistic回归正确率:', accuracy_score(y_test, y_hat))

    # XGBoost with multi:softmax requires labels in [0, num_class),
    # so remap class 3 -> 0 (labels become 0, 1, 2).
    y_train[y_train == 3] = 0
    y_test[y_test == 3] = 0
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # FIX: the original passed 'silent': 0, a parameter removed in XGBoost 1.0+
    # in favour of 'verbosity' (0=silent .. 3=debug); silent=0 meant "verbose".
    params = {'max_depth': 3, 'eta': 1, 'verbosity': 1,
              'objective': 'multi:softmax', 'num_class': 3}
    bst = xgb.train(params, data_train, num_boost_round=2, evals=watch_list)
    y_hat = bst.predict(data_test)
    print('XGBoost正确率:', accuracy_score(y_test, y_hat))
import xgboost as xgb
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import csv


def show_accuracy(a, b, tip):
    """Print and return the elementwise match rate between a and b, in percent.

    a, b: array-likes of equal size (flattened before comparison).
    tip:  label used in the printed message.
    """
    acc = a.ravel() == b.ravel()
    acc_rate = 100 * float(acc.sum()) / a.size
    print('%s正确率:%.3f%%' % (tip, acc_rate))
    return acc_rate


def load_data(file_name, is_train):
    """Load a Titanic CSV, impute missing values, and build the feature matrix.

    Returns (x, y) when is_train is True, otherwise (x, passenger_id).
    Side effects: prints diagnostics and writes the augmented frame to 'New_Data.csv'.
    """
    data = pd.read_csv(file_name)

    # Encode sex as 0/1.
    data['Sex'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill missing fares with the median fare of the passenger's class (1..3).
    if len(data.Fare[data.Fare.isnull()]) > 0:
        fare = np.zeros(3)
        for f in range(0, 3):
            fare[f] = data[data.Pclass == f + 1]['Fare'].dropna().median()
        for f in range(0, 3):
            data.loc[(data.Fare.isnull()) & (data.Pclass == f + 1), 'Fare'] = fare[f]

    # Impute missing ages with a RandomForestRegressor fitted on rows where
    # Age is known. The original duplicated this whole branch for train/test;
    # only the feature list ('Survived' exists only in training data) and the
    # progress tag differ, so the two copies are merged here.
    if is_train:
        tag = '随机森林预测缺失年龄'
        cols = ['Age', 'Survived', 'Fare', 'Parch', 'SibSp', 'Pclass']
    else:
        tag = '随机森林预测缺失年龄2'
        cols = ['Age', 'Fare', 'Parch', 'SibSp', 'Pclass']
    print('%s:--start--' % tag)
    data_for_age = data[cols]
    age_exist = data_for_age.loc[(data.Age.notnull())]   # rows with known age
    age_null = data_for_age.loc[(data.Age.isnull())]
    x = age_exist.values[:, 1:]
    y = age_exist.values[:, 0]
    rfr = RandomForestRegressor(n_estimators=1000)
    rfr.fit(x, y)
    data.loc[(data.Age.isnull()), 'Age'] = rfr.predict(age_null.values[:, 1:])
    print('%s:--over--' % tag)

    # Missing embarkation port -> 'S', then one-hot encode the port.
    data.loc[(data.Embarked.isnull()), 'Embarked'] = 'S'
    embarked_data = pd.get_dummies(data.Embarked)
    print(embarked_data)
    embarked_data = embarked_data.rename(columns=lambda x: 'Embarked_' + str(x))
    data = pd.concat([data, embarked_data], axis=1)
    print(data.describe())
    data.to_csv('New_Data.csv')

    x = data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
              'Embarked_C', 'Embarked_Q', 'Embarked_S']]
    y = None
    if 'Survived' in data:
        y = data['Survived']
    x = np.array(x)
    y = np.array(y)
    # NOTE(review): tiling replicates every row 5x BEFORE the train/test split,
    # so identical rows land on both sides of the split and inflate the
    # reported test accuracy. Kept for output compatibility — reconsider.
    x = np.tile(x, (5, 1))
    y = np.tile(y, (5, ))
    if is_train:
        return x, y
    return x, data['PassengerId']


def write_result(c, c_type):
    """Predict the test file with classifier c and write Prediction_<c_type>.csv.

    c_type: 1=LogisticRegression, 2=RandomForest, 3=XGBoost booster
            (the booster needs its input wrapped in a DMatrix).
    """
    file_name = 'Titanic.test.csv'
    x, passenger_id = load_data(file_name, False)
    # FIX: the original tested `type == 3` — that compares the BUILTIN `type`
    # to 3, which is always False, so the DMatrix branch never ran and
    # bst.predict crashed on a raw ndarray. Compare the c_type argument.
    if c_type == 3:
        x = xgb.DMatrix(x)
    y = c.predict(x)
    # Threshold probabilities at 0.5 into 0/1 survival labels.
    y[y > 0.5] = 1
    y[~(y > 0.5)] = 0
    # FIX: csv.writer needs a TEXT file with newline='' under Python 3;
    # the original 'wb' mode raises TypeError. `with` also guarantees close.
    with open("Prediction_%d.csv" % c_type, "w", newline='') as predictions_file:
        open_file_object = csv.writer(predictions_file)
        open_file_object.writerow(["PassengerId", "Survived"])
        # NOTE(review): x was tiled 5x in load_data but passenger_id was not;
        # zip truncates to the first copy, which happens to align — fragile.
        open_file_object.writerows(zip(passenger_id, y))


if __name__ == "__main__":
    x, y = load_data('Titanic.train.csv', True)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1)

    # Logistic regression baseline.
    lr = LogisticRegression(penalty='l2')
    lr.fit(x_train, y_train)
    y_hat = lr.predict(x_test)
    lr_acc = accuracy_score(y_test, y_hat)
    # write_result(lr, 1)

    # Random forest.
    rfc = RandomForestClassifier(n_estimators=100)
    rfc.fit(x_train, y_train)
    y_hat = rfc.predict(x_test)
    rfc_acc = accuracy_score(y_test, y_hat)
    # write_result(rfc, 2)

    # XGBoost (binary logistic -> probabilities, thresholded at 0.5).
    data_train = xgb.DMatrix(x_train, label=y_train)
    data_test = xgb.DMatrix(x_test, label=y_test)
    watch_list = [(data_test, 'eval'), (data_train, 'train')]
    # FIX: 'silent' was removed in XGBoost 1.0+; 'verbosity': 0 is the
    # modern equivalent of the original 'silent': 1.
    param = {'max_depth': 6, 'eta': 0.8, 'verbosity': 0, 'objective': 'binary:logistic'}
    # 'subsample': 1, 'alpha': 0, 'lambda': 0, 'min_child_weight': 1}
    bst = xgb.train(param, data_train, num_boost_round=100, evals=watch_list)
    y_hat = bst.predict(data_test)
    # write_result(bst, 3)
    y_hat[y_hat > 0.5] = 1
    y_hat[~(y_hat > 0.5)] = 0
    xgb_acc = accuracy_score(y_test, y_hat)

    # FIX: accuracy_score returns a FRACTION in [0, 1]; the original printed it
    # with '%.3f%%' (e.g. "0.810%"). Scale to percent to match the format.
    print('Logistic回归:%.3f%%' % (100 * lr_acc))
    print('随机森林:%.3f%%' % (100 * rfc_acc))
    print('XGBoost:%.3f%%' % (100 * xgb_acc))

机器学习算法加强——XGBoost实践 —— 相关推荐

  1. 机器学习算法与Python实践之(六)二分k均值聚类

    机器学习算法与Python实践这个系列主要是参考<机器学习实战>这本书.因为自己想学习Python,然后也想对一些机器学习算法加深下了解,所以就想通过Python来实现几个比较常用的机器学 ...

  2. 机器学习算法与Python实践之(五)k均值聚类(k-means)

    机器学习算法与Python实践这个系列主要是参考<机器学习实战>这本书.因为自己想学习Python,然后也想对一些机器学习算法加深下了解,所以就想通过Python来实现几个比较常用的机器学 ...

  3. 机器学习算法与Python实践之(三)支持向量机(SVM)进阶

    机器学习算法与Python实践这个系列主要是参考<机器学习实战>这本书.因为自己想学习Python,然后也想对一些机器学习算法加深下了解,所以就想通过Python来实现几个比较常用的机器学 ...

  4. 机器学习算法与Python实践之(二)支持向量机(SVM)初

    机器学习算法与Python实践这个系列主要是参考<机器学习实战>这本书.因为自己想学习Python,然后也想对一些机器学习算法加深下了解,所以就想通过Python来实现几个比较常用的机器学 ...

  5. 机器学习算法与Python实践之(二)支持向量机

    http://blog.csdn.net/zouxy09/article/details/17291543 机器学习算法与Python实践这个系列主要是参考<机器学习实战>这本书.因为自己 ...

  6. 机器学习算法与Python实践之(四)支持向量机(SVM)实现

    机器学习算法与Python实践之(四)支持向量机(SVM)实现 zouxy09@qq.com http://blog.csdn.net/zouxy09 机器学习算法与Python实践这个系列主要是参考 ...

  7. 机器学习算法与Python实践之(二)支持向量机(SVM)初级

    机器学习算法与Python实践之(二)支持向量机(SVM)初级 zouxy09@qq.com http://blog.csdn.net/zouxy09 机器学习算法与Python实践这个系列主要是参考 ...

  8. 机器学习算法与Python实践之(二)k近邻(KNN)

      机器学习算法与Python实践之(二)k近邻(KNN) (基于稀疏矩阵的k近邻(KNN)实现) 一.概述 这里我们先来看看当我们的数据是稀疏时,如何用稀疏矩阵的特性为KNN算法加速.KNN算法在之 ...

  9. 机器学习算法与Python实践之逻辑回归(Logistic Regression)

    转载自:http://blog.csdn.net/zouxy09/article/details/20319673 机器学习算法与Python实践这个系列主要是参考<机器学习实战>这本书. ...

最新文章

  1. 在Ubuntu 14.04 64bit上安装OpenResty 1.9.7.4
  2. JavaWeb总结(六)—Session
  3. K8S部署工具:KubeOperator主要概念
  4. java -web html5学习1
  5. 人工智能秘史(一):会下棋的土耳其机器人
  6. 每一个都能笑抽,39个奇葩代码注释
  7. oracle数据库中的系统自带表情_oracle 系统自带几个常用函数
  8. centos7升级gcc,并安装redis
  9. iOS 开发中 通过AVAssetWriter将录像视频写到指定文件
  10. 知识图谱构建技术总述
  11. java杯子换水_水壶问题(向水壶中倒z升水) Water and Jug Problem
  12. 广州三本找Java实习经历
  13. 获取pc微信信息_如何获取有关您的PC的详细信息
  14. 傅里叶分析的方方面面:复正弦、负频率
  15. python3实现softmax + 函数曲线绘制
  16. Hive beeline常用操作
  17. 并发编程之Disruptor框架介绍和高阶运用
  18. java丐帮_java多线程学习笔记(六)
  19. 计算机与化学相关论文,关于化学与计算机论文相关频道推荐
  20. [剑指Offer]-左旋转字符串

热门文章

  1. Java项目如何改成maven_普通java项目改进为maven:ecplise
  2. 文本编辑器第一阶段测试
  3. C++中多态的概念和意义
  4. C++中的静态成员变量
  5. mysql开发java心得_关于mysql 一些优化心得
  6. tensorflow max_pooling
  7. win7磁盘清理_window7越来越卡?系统残余文件太多,磁盘清理可以搞定!!
  8. 80. Leetcode 1642. 可以到达的最远建筑 (堆-技巧三-事后小诸葛)
  9. 633. Sum of Square Numbers
  10. NTU 课程笔记 CE7454作业(1):DeepFashion属性预测挑战【介绍篇】