数据挖掘入门 - 泰坦尼克号生存预测

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression  # logistic regression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier  # decision tree
import matplotlib.pyplot as plt

# Load the Titanic training set.
titanic_data = pd.read_csv(r"D:/titanic/train.csv")

# Take a first look at the data.
titanic_data.head()

# Check which columns contain missing values.
titanic_data.info()

可以看出，有部分列有缺失值，这个后面要做相应处理。

1、数据预处理

1.1 重复值处理

# Count the rows that are exact duplicates of an earlier row.
titanic_data.duplicated().sum()

可以看出，数据比较好，没有重复值

1.2 从经验看来（这步其实在实际中是跟业务强相关），cabin、name、ticket、passengerId这些特征，与最终是否获救是没有直接关系的，因此可以把这四列删除

# Drop the columns the text above judges to have no direct bearing on survival.
irrelevant_columns = ['Cabin', 'Name', 'Ticket', 'PassengerId']
titanic_data.drop(columns=irrelevant_columns, inplace=True)

1.3 填补缺失值

# Fill missing Age values with the column mean (SimpleImputer's default strategy).
imp_mean = SimpleImputer()
# fit_transform returns an (n_samples, 1) array; flatten it so a plain 1-D
# column is assigned instead of relying on pandas to broadcast a 2-D block.
titanic_data['Age'] = imp_mean.fit_transform(titanic_data[['Age']]).ravel()

# Fill missing Embarked values with the most frequent category.
titanic_data['Embarked'].value_counts()
imp_mostFre = SimpleImputer(strategy='most_frequent')
titanic_data['Embarked'] = imp_mostFre.fit_transform(titanic_data[['Embarked']]).ravel()

1.4 对Sex和Embarked进行OneHotEncoder

# One-hot encode the two categorical columns.
wait_to_encode = titanic_data.loc[:, ['Sex', 'Embarked']]
one_hot_encoder = OneHotEncoder(categories='auto')
tmp = one_hot_encoder.fit(wait_to_encode)
result = tmp.transform(wait_to_encode).toarray()

# Assumes the encoder emits Sex (female, male) followed by Embarked (C, Q, S);
# verify against one_hot_encoder.categories_ if the data changes.
new_columns = ['Female', 'Male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']

# Reuse titanic_data's index: concat(axis=1) aligns on index labels, so a
# fresh RangeIndex would misalign rows if the frame's index is not 0..n-1.
encoded = pd.DataFrame(result, columns=new_columns, index=titanic_data.index)
titanic_data = pd.concat([titanic_data, encoded], axis=1)
titanic_data.drop(['Sex', 'Embarked'], axis=1, inplace=True)
titanic_data.head()

1.5 分特征数据集和标签数据集

# Feature matrix: every column except the label.
titanic_feature = titanic_data.loc[:, titanic_data.columns != 'Survived']
# Label vector: select it as a 1-D Series. A one-column DataFrame makes
# scikit-learn estimators warn that a column-vector y was passed.
titanic_target = titanic_data['Survived']

2、特征预处理

2.1 方差过滤

from sklearn.feature_selection import VarianceThreshold

# Remove features whose variance does not exceed the default threshold.
# NOTE: fit_transform returns a NumPy array, so titanic_feature is no longer
# a DataFrame after this step.
varice_Selector = VarianceThreshold()
titanic_feature = varice_Selector.fit_transform(titanic_feature)

2.2 互信息过滤

from sklearn.feature_selection import mutual_info_classif as MIC
from sklearn.feature_selection import SelectKBest

# Estimated mutual information between each feature and the label.
result = MIC(titanic_feature, titanic_target)
result

可见，所有特征均与标签有关，没有不相关的（在互信息中，认为互信息为0的两个向量，他们之间是没有相关性的）。

因此，如下代码不会删除任何特征

from sklearn.feature_selection import SelectKBest

# Keep only the features whose estimated mutual information is positive.
k = int((result > 0).sum())
# Fit on the feature matrix and label defined above (the original referenced
# X and y, which are not defined at this point in the script).
X_fsmic = SelectKBest(MIC, k=k).fit_transform(titanic_feature, titanic_target)

3、划分训练集与测试集

from sklearn.model_selection import train_test_split

Xtrain, Xtest, Ytrain, Ytest = train_test_split(titanic_feature, titanic_target, test_size=0.3)

# train_test_split shuffles the rows, so renumber the index of each pandas
# subset. NumPy arrays (such as the features returned by
# VarianceThreshold.fit_transform) have no index and are left untouched.
for subset in (Xtrain, Xtest, Ytrain, Ytest):
    if isinstance(subset, (pd.DataFrame, pd.Series)):
        subset.index = range(subset.shape[0])

4、模型构建 - 分类决策树

# Baseline: default decision tree scored with 10-fold cross-validation.
clf = DecisionTreeClassifier(random_state=25)
fold_scores = cross_val_score(clf, titanic_feature, titanic_target, cv=10)
score_ = fold_scores.mean()
score_

# Tuning step 1: try the entropy criterion.
clf = DecisionTreeClassifier(random_state=25, criterion='entropy')
fold_scores = cross_val_score(clf, titanic_feature, titanic_target, cv=10)
score_ = fold_scores.mean()
score_

# Tuning step 2: learning curve over max_depth.
depths = range(1, 21)
score = []
for i in depths:
    clf = DecisionTreeClassifier(criterion='entropy', random_state=25, max_depth=i)
    score_ = cross_val_score(clf, titanic_feature, titanic_target, cv=10).mean()
    score.append(score_)

# Create the figure once, after the loop, so only a single plot is opened.
plt.figure()
plt.plot(depths, score)
plt.xticks(depths)
plt.show()

# Re-score the tree at max_depth=7, the depth the accompanying text reports as best.
clf = DecisionTreeClassifier(criterion='entropy', random_state=25, max_depth=7)
score_ = cross_val_score(clf, titanic_feature, titanic_target, cv=10).mean()
score_

可见，max_depth的深度为7时，模型的效果最好，此时模型的分数为82.2759分

# Tuning step 3: compare splitter='best' with splitter='random'.
# 'best' is the default and was scored above; try 'random' here.
clf = DecisionTreeClassifier(criterion='entropy', random_state=25, max_depth=7, splitter='random')
score_ = cross_val_score(clf, titanic_feature, titanic_target, cv=10).mean()
score_

结果发现，降低了，所以splitter就选择best

# Tuning step 4: joint grid search over min_samples_leaf and min_samples_split.
from sklearn.model_selection import GridSearchCV
import numpy as np

param_grid = {
    'min_samples_leaf': np.arange(1, 10, 1),
    'min_samples_split': np.arange(2, 16, 1),
}
clf = DecisionTreeClassifier(criterion='entropy', random_state=25, max_depth=7, splitter='best')
GS = GridSearchCV(clf, param_grid, cv=10)
GS.fit(titanic_feature, titanic_target)
print(GS.best_score_)
print(GS.best_params_)

# Tuning step 5: grid search over max_features.
# NOTE(review): this uses splitter='random' although the splitter comparison
# in the text settled on 'best' -- confirm which one is intended.
param_grid = {'max_features': np.arange(3, 10, 1)}
clf = DecisionTreeClassifier(
    criterion='entropy',
    random_state=25,
    max_depth=7,
    splitter='random',
    min_samples_leaf=4,
    min_samples_split=15,
)
GS = GridSearchCV(clf, param_grid, cv=10)
GS.fit(titanic_feature, titanic_target)
print(GS.best_score_)
print(GS.best_params_)

可见，固定max_features后，准确率反而下降了，说明固定后过拟合了，因此max_features的这个参数不动

因此在使用决策树这种模型时，能达到的最高精度分数为82.6092分，此时的参数配置如下

# Final decision-tree configuration reported by the text above.
clf = DecisionTreeClassifier(
    criterion='entropy',
    random_state=25,
    max_depth=7,
    splitter='random',
    min_samples_leaf=4,
    min_samples_split=15,
)

************************************************************************************************************************************************************************************

5、集成算法：随机森林

5.1 啥参数也不调，看下随机森林算法的模型精确度

# Baseline random forest with default hyper-parameters, 10-fold cross-validation.
rfc = RFC(random_state=100)
rf_fold_scores = cross_val_score(rfc, titanic_feature, titanic_target, cv=10)
rf_fold_scores.mean()

可以看出，在什么参数都不调的情况下，随机森林算法比单棵决策树算法高了5%左右

5.2 调整参数n_estimators

（1）初调

# Coarse search over n_estimators (10, 20, ..., 190).
Iter = np.arange(10, 200, 10)
score = []
for i in Iter:
    rfc = RFC(n_estimators=i, random_state=100)
    score_ = cross_val_score(rfc, titanic_feature, titanic_target, cv=10).mean()
    score.append(score_)

# Best score and the n_estimators value that produced it.
maxScore = max(score)
maxIter = Iter[score.index(maxScore)]
print(maxScore)
print(maxIter)

# Learning curve for n_estimators.
plt.figure()
plt.plot(Iter, score)
plt.show()

可以看出，在n_estimators为140左右时，会有峰值

（2）细调

# Fine search over n_estimators in [130, 150).
Iter = np.arange(130, 150)
score = []
for i in Iter:
    # Fix random_state (as in the coarse search) so differences between
    # candidates reflect n_estimators rather than run-to-run randomness.
    rfc = RFC(n_estimators=i, random_state=100)
    score_ = cross_val_score(rfc, titanic_feature, titanic_target, cv=10).mean()
    score.append(score_)

# Best score and the n_estimators value that produced it.
maxScore = max(score)
maxIter = Iter[score.index(maxScore)]
print(maxScore)
print(maxIter)

# Learning curve for n_estimators.
plt.figure()
plt.plot(Iter, score)
plt.show()

因此，在此次实验中，我们设定n_estimators为131

5.3 调整参数max_depth

# Grid search over max_depth for the random forest.
param_grid = {'max_depth': np.arange(1, 20)}
# Use the RFC alias, matching every other random-forest snippet in this file.
rfc = RFC(n_estimators=131, random_state=100)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(titanic_feature, titanic_target)
print(GS.best_params_)
print(GS.best_score_)

5.4 调整min_samples_leaf和min_samples_split这两个参数

# Joint grid search over min_samples_leaf and min_samples_split.
param_grid = {
    'min_samples_leaf': list(range(1, 20)),
    'min_samples_split': list(range(2, 20)),
}
rfc = RFC(n_estimators=131, random_state=100, max_depth=8)
GS = GridSearchCV(rfc, param_grid, cv=10)
GS.fit(titanic_feature, titanic_target)
print(GS.best_params_)
print(GS.best_score_)

可以看到，最佳参数配置如上图所示

5.5 调整max_features

由前面的调整可以看出，模型现在处在过拟合状态，所以势必要减少选择的特征数，才能使模型向着欠拟合的方向去移动。在默认状态下，随机森林选择的特征个数为 $\sqrt{n}$（向下取整），其中n表示特征的总个数。在本案例中，n为10，$\sqrt{10}\approx 3.16$，所以默认状态下随机森林的特征选择应该为3。又由于现在模型是过拟合状态，所以可以尝试减小特征选择的个数，使模型向着欠拟合的状态过渡。

但是我们发现，固定max_features的时候，分数反而变低了，所以max_features这个参数不动为好。
因此，最终的参数选择如下

# Final random-forest configuration.
# The original passed min_impurity_split=2, a parameter scikit-learn has
# removed and that section 5.4 never tuned; presumably min_samples_split
# (the parameter tuned together with min_samples_leaf) was intended.
rfc = RFC(
    n_estimators=131,
    random_state=100,
    max_depth=8,
    min_samples_leaf=2,
    min_samples_split=2,
    criterion='gini',
)

准确率为84.74%

************************************************************************************************************************************************************************************

6、逻辑回归算法

6.1 啥参数也不调，看下逻辑回归模型的效果

from sklearn.linear_model import LogisticRegression as LR
from sklearn.preprocessing import StandardScaler

# Standardize the features first.
scaler = StandardScaler()
scaler.fit(titanic_feature)
X_std = scaler.transform(titanic_feature)
y = titanic_target

# Baseline logistic regression with default hyper-parameters, 10-fold cross-validation.
lr = LR(random_state=100)
lr_fold_scores = cross_val_score(lr, X_std, y, cv=10)
score = lr_fold_scores.mean()

6.2 因为不清楚特征是否与标签之间是线性关系，可以尝试下各种Solver

# Compare the available solvers under 10-fold cross-validation.
Solver = ['liblinear', 'lbfgs', 'newton-cg', 'sag', 'saga']
score = []
for solver in Solver:
    lr = LR(solver=solver, random_state=200)
    # Score on the standardized features, as the baseline and the C search do;
    # sag/saga in particular need features on a similar scale to converge.
    score.append(cross_val_score(lr, X_std, y, cv=10).mean())
score

我慌了，貌似对这个小数据集，哪种solver取得的效果都一样~~

6.3 调整超参数C

# Learning curve over the inverse regularization strength C.
score = []
C_iter = np.linspace(0.01, 0.2, 20)
for i in C_iter:
    lr = LR(solver='liblinear', C=i, random_state=100)
    score.append(cross_val_score(lr, X_std, y, cv=10).mean())

# Create the figure once, after the loop, so only a single plot is opened.
plt.figure()
plt.plot(C_iter, score)
plt.show()

# C value with the highest mean cross-validation score.
maxIndex = score.index(max(score))
C_max = C_iter[maxIndex]

可以看出，当C为0.09时，模型的精度为0.7958，相比于上面提高了0.0001

6.4 调整参数max_iter

# Learning curve over max_iter.
score = []
Max_iter = np.arange(1, 11)
for i in Max_iter:
    lr = LR(solver='liblinear', C=0.09, random_state=100, max_iter=i)
    score.append(cross_val_score(lr, X_std, y, cv=10).mean())

# Create the figure once, after the loop, so only a single plot is opened.
plt.figure()
plt.plot(Max_iter, score)
plt.show()

可以看出，相对于前面没有调整，模型精度没有变化