%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def read_dataset(fname, embarked_labels=('S', 'C', 'Q')):
    """Load a Titanic CSV file and return it as an all-numeric DataFrame.

    Args:
        fname: Path of the CSV file; its first column becomes the row index.
        embarked_labels: Port labels in the order of their integer codes.
            A fixed table keeps the encoding identical across files; deriving
            it from each file's order of first appearance (as was done
            before) can give the same port different codes in different
            files. The default presumably matches the order of first
            appearance in train.csv (S, C, Q) -- TODO confirm.

    Returns:
        The DataFrame without the 'Name', 'Ticket' and 'Cabin' columns, with
        'Sex' encoded as 1 for 'male' and 0 otherwise, 'Embarked' encoded as
        an integer code, and all remaining missing values replaced by 0.
    """
    # Use the first column as the row index.
    data = pd.read_csv(fname, index_col=0)
    # Drop the columns that are not used as features.
    data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    # Encode sex: 'male' -> 1, anything else -> 0.
    data['Sex'] = (data['Sex'] == 'male').astype('int')
    # Encode the port of embarkation; missing or unlisted ports share the
    # code that follows the last known label.
    codes = {label: code for code, label in enumerate(embarked_labels)}
    data['Embarked'] = data['Embarked'].map(codes).fillna(len(codes)).astype('int')
    # Replace the remaining missing values with 0.
    data = data.fillna(0)
    return data


train = read_dataset('datasets/titanic/train.csv')
train.head()
Survived Pclass Sex Age SibSp Parch Fare Embarked
PassengerId
1 0 3 1 22.0 1 0 7.2500 0
2 1 1 0 38.0 1 0 71.2833 1
3 1 3 0 26.0 0 0 7.9250 0
4 1 1 0 35.0 1 0 53.1000 0
5 0 3 1 35.0 0 0 8.0500 0
from sklearn.model_selection import train_test_split

# The 'Survived' column is the target; every other column is a feature.
y = train['Survived'].values
feature_frame = train.drop(['Survived'], axis=1)
X = feature_frame.values

# Hold out the fraction of rows given by test_size for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('train dataset: {0}; test dataset: {1}'.format(X_train.shape, X_test.shape))
train dataset: (712, 7); test dataset: (179, 7)
from sklearn.tree import DecisionTreeClassifier

# Baseline: a decision tree with default hyper-parameters, scored on both
# the training split and the held-out split.
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
train_score, test_score = clf.score(X_train, y_train), clf.score(X_test, y_test)
print('train score: {0}; test score: {1}'.format(train_score, test_score))
train score: 0.9901685393258427; test score: 0.7821229050279329
from sklearn.tree import export_graphviz

# Write the fitted tree `clf` to titanic.dot. The return value of
# export_graphviz is not assigned to `f`, so the open file handle is not
# rebound while the `with` block still owns it.
with open("titanic.dot", 'w') as f:
    export_graphviz(clf, out_file=f)
# Parameter selection: max_depth
def cv_score(d):
    """Fit a tree with ``max_depth=d``; return (training score, held-out score)."""
    tree = DecisionTreeClassifier(max_depth=d)
    tree.fit(X_train, y_train)
    fitted_score = tree.score(X_train, y_train)
    held_out_score = tree.score(X_test, y_test)
    return (fitted_score, held_out_score)


# Evaluate every candidate depth and split the score pairs into two series.
depths = range(2, 15)
scores = [cv_score(d) for d in depths]
tr_scores = [pair[0] for pair in scores]
cv_scores = [pair[1] for pair in scores]

# Pick the depth with the highest held-out score.
best_score_index = np.argmax(cv_scores)
best_score = cv_scores[best_score_index]
best_param = depths[best_score_index]
print('best param: {0}; best score: {1}'.format(best_param, best_score))

# Plot both score series against the tree depth.
plt.figure(figsize=(10, 6), dpi=144)
plt.grid()
plt.xlabel('max depth of decision tree')
plt.ylabel('score')
plt.plot(depths, cv_scores, '.g-', label='cross-validation score')
plt.plot(depths, tr_scores, '.r--', label='training score')
plt.legend()
best param: 4; best score: 0.8268156424581006<matplotlib.legend.Legend at 0x19bd6cbb4e0>

# Train a model for a given threshold and compute its scores.
# NOTE(review): `min_impurity_split` was deprecated in scikit-learn 0.19 and
# removed in 1.0 -- confirm the installed version still accepts it.
def cv_score(val):
    """Fit a gini tree with ``min_impurity_split=val``.

    Args:
        val: Threshold passed to DecisionTreeClassifier as min_impurity_split.

    Returns:
        A ``(training score, held-out score)`` tuple, scored on
        ``X_train``/``y_train`` and ``X_test``/``y_test`` respectively.
    """
    clf = DecisionTreeClassifier(criterion='gini', min_impurity_split=val)
    clf.fit(X_train, y_train)
    tr_score = clf.score(X_train, y_train)
    held_out_score = clf.score(X_test, y_test)
    return (tr_score, held_out_score)


# Train one model per candidate threshold and collect the scores.
values = np.linspace(0, 0.5, 50)
scores = [cv_score(v) for v in values]
tr_scores = [s[0] for s in scores]
cv_scores = [s[1] for s in scores]

# Find the threshold with the highest held-out score.
best_score_index = np.argmax(cv_scores)
best_score = cv_scores[best_score_index]
best_param = values[best_score_index]
print('best param: {0}; best score: {1}'.format(best_param, best_score))

# Plot the scores against the threshold. The model uses criterion='gini',
# so the axis is labelled as a gini threshold rather than an entropy one.
plt.figure(figsize=(10, 6), dpi=144)
plt.grid()
plt.xlabel('threshold of gini impurity')
plt.ylabel('score')
plt.plot(values, cv_scores, '.g-', label='cross-validation score')
plt.plot(values, tr_scores, '.r--', label='training score')
plt.legend()
best param: 0.17346938775510204; best score: 0.8212290502793296<matplotlib.legend.Legend at 0x19be2859e80>

def plot_curve(train_sizes, cv_results, xlabel):
    """Plot mean training and cross-validation scores with a +/- std band.

    Args:
        train_sizes: X-axis values, one per entry of the score arrays (the
            callers in this file pass the searched parameter values).
        cv_results: Mapping providing 'mean_train_score', 'std_train_score',
            'mean_test_score' and 'std_test_score'.
        xlabel: Label for the x axis.
    """
    # Coerce to arrays so the mean +/- std arithmetic also works when the
    # scores are supplied as plain Python lists.
    train_scores_mean = np.asarray(cv_results['mean_train_score'])
    train_scores_std = np.asarray(cv_results['std_train_score'])
    test_scores_mean = np.asarray(cv_results['mean_test_score'])
    test_scores_std = np.asarray(cv_results['std_test_score'])

    plt.figure(figsize=(10, 6), dpi=144)
    plt.title('parameters tuning')
    plt.grid()
    plt.xlabel(xlabel)
    plt.ylabel('score')

    # Shaded bands: one standard deviation around each mean curve.
    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1, color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1, color="g")

    # Mean curves.
    plt.plot(train_sizes, train_scores_mean, '.--', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, '.-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
from sklearn.model_selection import GridSearchCV

# Candidate thresholds for min_impurity_split.
thresholds = np.linspace(0, 0.5, 50)

# Set the parameters by cross-validation
param_grid = dict(min_impurity_split=thresholds)
clf = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid,
    cv=5,
    return_train_score=True,
)
clf.fit(X, y)
print("best param: {0}\nbest score: {1}".format(clf.best_params_, clf.best_score_))

# Plot the per-threshold mean scores recorded by the search.
plot_curve(thresholds, clf.cv_results_, xlabel='gini thresholds')
best param: {'min_impurity_split': 0.22448979591836732}
best score: 0.8181818181818182

from sklearn.model_selection import GridSearchCV

entropy_thresholds = np.linspace(0, 1, 50)
gini_thresholds = np.linspace(0, 0.5, 50)

# Set the parameters by cross-validation
# Each dict is a separate grid; the search tries all of them.
entropy_grid = {'criterion': ['entropy'], 'min_impurity_split': entropy_thresholds}
gini_grid = {'criterion': ['gini'], 'min_impurity_split': gini_thresholds}
depth_grid = {'max_depth': range(2, 10)}
split_grid = {'min_samples_split': range(2, 30, 2)}
param_grid = [entropy_grid, gini_grid, depth_grid, split_grid]

clf = GridSearchCV(
    DecisionTreeClassifier(),
    param_grid,
    cv=5,
    return_train_score=True,
)
clf.fit(X, y)
print("best param: {0}\nbest score: {1}".format(clf.best_params_, clf.best_score_))
best param: {'criterion': 'entropy', 'min_impurity_split': 0.5306122448979591}
best score: 0.8271604938271605
# Show the parameter combination stored on the fitted search object `clf`.
print(clf.best_params_)
{'criterion': 'entropy', 'min_impurity_split': 0.5306122448979591}
# Fit the final tree on the training split and keep the fitted estimator
# in `model` as well.
# NOTE(review): `min_impurity_split` was deprecated in scikit-learn 0.19 and
# removed in 1.0 -- confirm the installed version still accepts it.
clf = DecisionTreeClassifier(criterion='entropy', min_impurity_split=0.002857142857142857)
model = clf.fit(X_train, y_train)
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print('train score: {0}; test score: {1}'.format(train_score, test_score))

# Export the tree to titanic.dot. The return value of export_graphviz is
# not assigned to `f`, so the open file handle is not rebound while the
# `with` block still owns it.
with open("titanic.dot", 'w') as f:
    export_graphviz(clf, out_file=f)
# 1. Install graphviz on this machine
# 2. Run `dot -Tpng titanic.dot -o titanic.png`
# 3. Open the generated decision tree image titanic.png in the current directory
train score: 0.9901685393258427; test score: 0.776536312849162
def read_dataset(fname, embarked_labels=('S', 'C', 'Q')):
    """Load a Titanic CSV file and return it as an all-numeric DataFrame.

    Args:
        fname: Path of the CSV file; its first column becomes the row index.
        embarked_labels: Port labels in the order of their integer codes.
            A fixed table keeps the encoding identical across files; deriving
            it from each file's order of first appearance (as was done
            before) can give the same port different codes in train.csv and
            test.csv, so the model would see mis-coded features. The default
            presumably matches the order of first appearance in train.csv
            (S, C, Q) -- TODO confirm.

    Returns:
        The DataFrame without the 'Name', 'Ticket' and 'Cabin' columns, with
        'Sex' encoded as 1 for 'male' and 0 otherwise, 'Embarked' encoded as
        an integer code, and all remaining missing values replaced by 0.
    """
    # Use the first column as the row index.
    data = pd.read_csv(fname, index_col=0)
    # Drop the columns that are not used as features.
    data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)
    # Encode sex: 'male' -> 1, anything else -> 0.
    data['Sex'] = (data['Sex'] == 'male').astype('int')
    # Encode the port of embarkation; missing or unlisted ports share the
    # code that follows the last known label.
    codes = {label: code for code, label in enumerate(embarked_labels)}
    data['Embarked'] = data['Embarked'].map(codes).fillna(len(codes)).astype('int')
    # Replace the remaining missing values with 0.
    data = data.fillna(0)
    return data


# NOTE: the variable keeps its original name `train`, but from here on it
# holds the rows of test.csv.
train = read_dataset('datasets/titanic/test.csv')
x_test = train.values

# Predict with `model`, the estimator fitted earlier. The unfitted
# classifier that used to be created here was never used and is omitted.
ans = model.predict(x_test)

# Save the predictions, indexed like the input file, to 1.csv.
tab_2 = pd.DataFrame({'Survived': ans}, index=train.index)
tab_2.to_csv('1.csv')

决策树_预测泰坦尼克号幸存者相关推荐

  1. Spark:基于PySpark的逻辑回归和决策树模型对泰坦尼克号幸存者预测的机器学习流程

    pyspark的ML回顾下 文章目录 官网文档 环境 泰坦尼克号数据分析 泰坦尼克号数据清洗整理 Spark ML Pipeline Titanic幸存者预测:逻辑回归LR模型 模型训练 模型预测 T ...

  2. 机器学习代码实战——决策树(预测泰坦尼克号船员生存情况)

    文章目录 1.实验目的 2.数据预处理 3.导入必要模块 4.训练+计算模型得分 1.实验目的 1.建立决策树模型以根据某些参数预测泰坦尼克号人是否得以生存 2.在泰坦尼克数据中,使用以下各列构建一个 ...

  3. 袋装决策树_袋装树是每个数据科学家需要的机器学习算法

    袋装决策树 袋装树木介绍 (Introduction to Bagged Trees) Without diving into the specifics just yet, it's importa ...

  4. 数据分享|PYTHON用决策树分类预测糖尿病和可视化实例

    全文下载链接:http://tecdat.cn/?p=23848 在本文中,决策树是对例子进行分类的一种简单表示.它是一种有监督的机器学习技术,数据根据某个参数被连续分割.决策树分析可以帮助解决分类和 ...

  5. python实现决策树建模预测并验证评估

    实现功能: python实现数据读取.数据清洗.数据编码.数据降维分析.数据集划分.(具体参见前几篇文章),决策树建模预测并验证评估. 实现代码: # 导入需要的库 import pandas as ...

  6. 有关糖尿病模型建立的论文_预测糖尿病结果的模型比较

    有关糖尿病模型建立的论文 项目主题 (Subject of the Project) The dataset is primarily used for predicting the onset of ...

  7. arima 预测模型_预测未来:学习使用Arima模型进行预测

    arima 预测模型 XTS对象 (XTS Objects) If you're not using XTS objects to perform your forecasting in R, the ...

  8. 机器学习决策树_机器学习-决策树 Decision Tree

    咱们正式进入了机器学习的模型的部分,虽然现在最火的的机器学习方面的库是Tensorflow, 但是这里还是先简单介绍一下另一个数据处理方面很火的库叫做sklearn.其实咱们在前面已经介绍了一点点sk ...

  9. 决策树留一法python代码_从实例中归纳决策树_人工智能一

    从实例中归纳决策树_人工智能一 18.3.3 从实例中归纳决策树 布尔决策树的一个实例是由一个输入属性向量X以及一个唯一的布尔输出值y组成的.一个实例集合(X1,y1),-,(X12,y12)如图18 ...

  10. python中如何画出决策树_使用Python绘制决策树

    决策树为字典格式,示例如下: {'tearRate': {'reduced': 'no lenses', 'normal': {' astigmatic': {'yes': {' prescript' ...

最新文章

  1. 解决数据库读写分离(转)
  2. oracle双机python连接_Python连接Oracle
  3. 权限管理(shiro框架)
  4. STM32F103的PC13、PB3和PB4定义为普通IO口使用
  5. JDBC之在分层结构中实现业务
  6. maven正确的集成命令-U-B
  7. Simple Lambda Sample
  8. webpack 修改title_Webpack漫谈
  9. 为知笔记linux输入中文,为知笔记wiznote无法输入中文,fcitx输入法问题解决
  10. linux查看内存条pn,查看电脑内存条型号的两种方法【图文】
  11. Python自动化运维之1、Python入门
  12. 使用python进行北京二手房信息数据分析及可视化展示
  13. 小爱音箱当电脑音箱(电脑没有蓝牙)
  14. 不能装载文档控件。请在检查浏览器的选项中检查浏览器的安全设置_【网络安全知识系列(五)】如何正确设置浏览器!...
  15. Gym 101431B Vera and Banquet (后缀数组)
  16. 计算机专业笔记本用i5还是i7,玩游戏笔记本i5和i7的区别_笔记本电脑游戏用i5还是i7...
  17. 医学图像配准实现代码(matlab篇)
  18. 星来客机器人餐厅_星战来客——韩国“机器人士兵”小记
  19. 2021年福建省安全员B证(项目负责人)考试试题及福建省安全员B证(项目负责人)作业模拟考试
  20. “离婚”华为后,荣耀第一胎满身伤痕

热门文章

  1. 获取元素到页面上的位置
  2. jQuery 属性操作 - addClass() 和 removeClass() 方法
  3. SQL Server 2005混合模式登录配置
  4. 【C++】字符串替换问题
  5. java day37【web相关概念回顾 、web服务器软件:Tomcat 、Servlet入门学习】
  6. SVN教程 -- 基于自己学习记录
  7. Django里URL配置中name参数的作用
  8. Username is not in the sudoers file. This incident will be reported
  9. [IC]Lithograph(1)光刻技术分析与展望
  10. Winform 可取消的单选按钮(RadioButton)