import pandas as pd
# NOTE: sklearn.preprocessing.Imputer is deprecated (removed in sklearn 0.22);
# on newer versions use sklearn.impute.SimpleImputer instead.
from sklearn.preprocessing import Imputer

# Load the Titanic training set.
train_data = pd.read_csv("train.csv")
train_data.shape          # (891, 12) -- .shape is a property, not a method
train_data.describe()

# axis=0 imputes column-wise, axis=1 row-wise.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
# fit computes the statistics, transform fills the gaps; fit_transform does both.
age = imp.fit_transform(train_data[["Age"]].values)

# Pure-pandas equivalent: fill Age NaNs with the column mean and write back.
# .loc[:, "Age"] selects all rows of the Age column.
train_data.loc[:, "Age"] = train_data["Age"].fillna(value=train_data["Age"].mean())
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
# Common feature-engineering operations on numeric columns.

# Magnitude transform: natural log of Age.
import numpy as np

# DataFrame/Series.apply applies the function element-wise here.
log_age = train_data["Age"].apply(lambda x: np.log(x))

# Min-max scaling: x_norm = (x - x_min) / (x_max - x_min).
from sklearn.preprocessing import MinMaxScaler

mm_scaler = MinMaxScaler()
fare_mm = mm_scaler.fit_transform(train_data[["Fare"]])

# Standardization: x_std = (x - mean) / std.
from sklearn.preprocessing import StandardScaler

sds = StandardScaler()
fare_sds = sds.fit_transform(train_data[["Fare"]])

# 1. Summary statistics.
max_age = train_data["Age"].max()
min_age = train_data["Age"].min()

# Quantile: 25th percentile of Age.
age_quarter_1 = train_data["Age"].quantile(0.25)
age_quarter_1
22.0
# Polynomial (high-order) and interaction (cross) features.
from sklearn.preprocessing import PolynomialFeatures

pnf = PolynomialFeatures(degree=2)  # degree: polynomial order, default 2
# For two inputs a, b the expansion is [1, a, b, a^2, a*b, b^2] -> 6 columns.
age_pnf = pnf.fit_transform(train_data[["SibSp", "Parch"]])
age_pnf

# Discretization (binning/bucketing) with cut and qcut.
# cut = equal-width bins: e.g. ages 1-100 cut into 4 gives 0-25, 26-50, ...;
# each value falls into the bin covering it.
train_data.loc[:, "Fare_cut"] = pd.cut(train_data["Fare"], 5)  # 5 equal-width bins
train_data.head()
PassengerId Survived    Pclass  Name    Sex Age SibSp   Parch   Ticket  Fare    Cabin   Embarked    Fare_cut
0   1   0   3   Braund, Mr. Owen Harris male    22.0    1   0   A/5 21171   7.2500  NaN S   (-0.512, 102.466]
1   2   1   1   Cumings, Mrs. John Bradley (Florence Briggs Th...   female  38.0    1   0   PC 17599    71.2833 C85 C   (-0.512, 102.466]
2   3   1   3   Heikkinen, Miss. Laina  female  26.0    0   0   STON/O2. 3101282    7.9250  NaN S   (-0.512, 102.466]
3   4   1   1   Futrelle, Mrs. Jacques Heath (Lily May Peel)    female  35.0    1   0   113803  53.1000 C123    S   (-0.512, 102.466]
4   5   0   3   Allen, Mr. William Henry    male    35.0    0   0   373450  8.0500  NaN S   (-0.512, 102.466]train_data["Fare_cut"].unique()
[(-0.512, 102.466], (204.932, 307.398], (102.466, 204.932], (409.863, 512.329]]
Categories (4, object): [(-0.512, 102.466] < (102.466, 204.932] < (204.932, 307.398] < (409.863, 512.329]]
# Equal-frequency binning: qcut chooses bin edges so each bin holds
# (approximately) the same number of rows.
train_data.loc[:, "Fare_qcut"] = pd.qcut(train_data["Fare"], 5)
train_data["Fare_qcut"].unique()
[[0, 7.854], (39.688, 512.329], (7.854, 10.5], (10.5, 21.679], (21.679, 39.688]]
Categories (5, object): [[0, 7.854] < (7.854, 10.5] < (10.5, 21.679] < (21.679, 39.688] < (39.688, 512.329]]#one hot encoding 独热向量编码  但是会稀释样本特征,造成数据量增大
# One-hot encode Embarked. Note: one-hot encoding inflates the feature space
# and makes the sample matrix sparse.
embarked_ohe = pd.get_dummies(train_data[['Embarked']])
embarked_ohe.head()
C   Q   S
0   0   0   1
1   1   0   0
2   0   0   1
3   0   0   1
# One-hot encode the binned fare (each qcut interval becomes a column).
fareqcut_ohe = pd.get_dummies(train_data["Fare_qcut"])
fareqcut_ohe.head()
[0, 7.854]  (7.854, 10.5]   (10.5, 21.679]  (21.679, 39.688]    (39.688, 512.329]
0   1   0   0   0   0
1   0   0   0   0   1
2   0   1   0   0   0
3   0   0   0   0   1
4   0   1   0   0   0对日期处理
# Date/time feature handling.
import pandas as pd

# BUG FIX: the original called pd.read_csv("") with an empty path, which
# always raises. Point it at the car-sales CSV used later in this notebook.
car_sales = pd.read_csv("car_data.csv")

df_train = pd.read_csv('train.csv')
df_train.head(10)
PassengerId Survived    Pclass  Name    Sex Age SibSp   Parch   Ticket  Fare    Cabin   Embarked
0   1   0   3   Braund, Mr. Owen Harris male    22.0    1   0   A/5 21171   7.2500  NaN S
1   2   1   1   Cumings, Mrs. John Bradley (Florence Briggs Th...   female  38.0    1   0   PC 17599    71.2833 C85 C
2   3   1   3   Heikkinen, Miss. Laina  female  26.0    0   0   STON/O2. 3101282    7.9250  NaN S
3   4   1   1   Futrelle, Mrs. Jacques Heath (Lily May Peel)    female  35.0    1   0   113803  53.1000 C123    S
4   5   0   3   Allen, Mr. William Henry    male    35.0    0   0   373450  8.0500  NaN S
5   6   0   3   Moran, Mr. James    male    NaN 0   0   330877  8.4583  NaN Q
6   7   0   1   McCarthy, Mr. Timothy J male    54.0    0   0   17463   51.8625 E46 S
7   8   0   3   Palsson, Master. Gosta Leonard  male    2.0 3   1   349909  21.0750 NaN S
8   9   1   3   Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)   female  27.0    0   2   347742  11.1333 NaN S
9   10  1   2   Nasser, Mrs. Nicholas (Adele Achem) female  14.0    1   0   237736  30.0708 NaN Cdf_train.describe()
PassengerId Survived    Pclass  Age SibSp   Parch   Fare
count   891.000000  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean    446.000000  0.383838    2.308642    29.699118   0.523008    0.381594    32.204208
std 257.353842  0.486592    0.836071    14.526497   1.102743    0.806057    49.693429
min 1.000000    0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25% 223.500000  0.000000    2.000000    20.125000   0.000000    0.000000    7.910400
50% 446.000000  0.000000    3.000000    28.000000   0.000000    0.000000    14.454200
75% 668.500000  1.000000    3.000000    38.000000   1.000000    0.000000    31.000000
max 891.000000  1.000000    3.000000    80.000000   8.000000    6.000000    512.329200df_train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KBdf_train[['Age']].values
array([[ 22.  ],[ 38.  ],[ 26.  ],[ 35.  ],[ 35.  ],[   nan],[ 54.  ],[  2.  ],[ 27.  ],[ 14.  ],[  4.  ],[ 58.  ],[ 20.  ],[ 39.  ],[ 14.  ],[ 55.  ],[  2.  ],[   nan],[ 31.  ],[   nan],[ 35.  ],[ 34.  ],[ 15.  ],[ 28.  ],[  8.  ],[ 38.  ],[   nan],[ 19.  ],[   nan],[   nan],[ 40.  ],[   nan],[   nan],[ 66.  ],[ 28.  ],[ 42.  ],[   nan],[ 21.  ],[ 18.  ],[ 14.  ],[ 40.  ],[ 27.  ],[   nan],[  3.  ],[ 19.  ],[   nan],[   nan],[   nan],[   nan],[ 18.  ],[  7.  ],[ 21.  ],[ 49.  ],[ 29.  ],[ 65.  ],[   nan],[ 21.  ],[ 28.5 ],[  5.  ],[ 11.  ],[ 22.  ],[ 38.  ],[ 45.  ],[  4.  ],[   nan],[   nan],[ 29.  ],[ 19.  ],[ 17.  ],[ 26.  ],[ 32.  ],[ 16.  ],[ 21.  ],[ 26.  ],[ 32.  ],[ 25.  ],[   nan],[   nan],[  0.83],[ 30.  ],[ 22.  ],[ 29.  ],[   nan],[ 28.  ],[ 17.  ],[ 33.  ],[ 16.  ],[   nan],[ 23.  ],[ 24.  ],[ 29.  ],[ 20.  ],[ 46.  ],[ 26.  ],[ 59.  ],[   nan],[ 71.  ],[ 23.  ],[ 34.  ],[ 34.  ],[ 28.  ],[   nan],[ 21.  ],[ 33.  ],[ 37.  ],[ 28.  ],[ 21.  ],[   nan],[ 38.  ],[   nan],[ 47.  ],[ 14.5 ],[ 22.  ],[ 20.  ],[ 17.  ],[ 21.  ],[ 70.5 ],[ 29.  ],[ 24.  ],[  2.  ],[ 21.  ],[   nan],[ 32.5 ],[ 32.5 ],[ 54.  ],[ 12.  ],[   nan],[ 24.  ],[   nan],[ 45.  ],[ 33.  ],[ 20.  ],[ 47.  ],[ 29.  ],[ 25.  ],[ 23.  ],[ 19.  ],[ 37.  ],[ 16.  ],[ 24.  ],[   nan],[ 22.  ],[ 24.  ],[ 19.  ],[ 18.  ],[ 19.  ],[ 27.  ],[  9.  ],[ 36.5 ],[ 42.  ],[ 51.  ],[ 22.  ],[ 55.5 ],[ 40.5 ],[   nan],[ 51.  ],[ 16.  ],[ 30.  ],[   nan],[   nan],[ 44.  ],[ 40.  ],[ 26.  ],[ 17.  ],[  1.  ],[  9.  ],[   nan],[ 45.  ],[   nan],[ 28.  ],[ 61.  ],[  4.  ],[  1.  ],[ 21.  ],[ 56.  ],[ 18.  ],[   nan],[ 50.  ],[ 30.  ],[ 36.  ],[   nan],[   nan],[  9.  ],[  1.  ],[  4.  ],[   nan],[   nan],[ 45.  ],[ 40.  ],[ 36.  ],[ 32.  ],[ 19.  ],[ 19.  ],[  3.  ],[ 44.  ],[ 58.  ],[   nan],[ 42.  ],[   nan],[ 24.  ],[ 28.  ],[   nan],[ 34.  ],[ 45.5 ],[ 18.  ],[  2.  ],[ 32.  ],[ 26.  ],[ 16.  ],[ 40.  ],[ 24.  ],[ 35.  ],[ 22.  ],[ 30.  ],[   nan],[ 31.  ],[ 27.  ],[ 42.  ],[ 32.  ],[ 30.  ],[ 16.  
],[ 27.  ],[ 51.  ],[   nan],[ 38.  ],[ 22.  ],[ 19.  ],[ 20.5 ],[ 18.  ],[   nan],[ 35.  ],[ 29.  ],[ 59.  ],[  5.  ],[ 24.  ],[   nan],[ 44.  ],[  8.  ],[ 19.  ],[ 33.  ],[   nan],[   nan],[ 29.  ],[ 22.  ],[ 30.  ],[ 44.  ],[ 25.  ],[ 24.  ],[ 37.  ],[ 54.  ],[   nan],[ 29.  ],[ 62.  ],[ 30.  ],[ 41.  ],[ 29.  ],[   nan],[ 30.  ],[ 35.  ],[ 50.  ],[   nan],[  3.  ],[ 52.  ],[ 40.  ],[   nan],[ 36.  ],[ 16.  ],[ 25.  ],[ 58.  ],[ 35.  ],[   nan],[ 25.  ],[ 41.  ],[ 37.  ],[   nan],[ 63.  ],[ 45.  ],[   nan],[  7.  ],[ 35.  ],[ 65.  ],[ 28.  ],[ 16.  ],[ 19.  ],[   nan],[ 33.  ],[ 30.  ],[ 22.  ],[ 42.  ],[ 22.  ],[ 26.  ],[ 19.  ],[ 36.  ],[ 24.  ],[ 24.  ],[   nan],[ 23.5 ],[  2.  ],[   nan],[ 50.  ],[   nan],[   nan],[ 19.  ],[   nan],[   nan],[  0.92],[   nan],[ 17.  ],[ 30.  ],[ 30.  ],[ 24.  ],[ 18.  ],[ 26.  ],[ 28.  ],[ 43.  ],[ 26.  ],[ 24.  ],[ 54.  ],[ 31.  ],[ 40.  ],[ 22.  ],[ 27.  ],[ 30.  ],[ 22.  ],[   nan],[ 36.  ],[ 61.  ],[ 36.  ],[ 31.  ],[ 16.  ],[   nan],[ 45.5 ],[ 38.  ],[ 16.  ],[   nan],[   nan],[ 29.  ],[ 41.  ],[ 45.  ],[ 45.  ],[  2.  ],[ 24.  ],[ 28.  ],[ 25.  ],[ 36.  ],[ 24.  ],[ 40.  ],[   nan],[  3.  ],[ 42.  ],[ 23.  ],[   nan],[ 15.  ],[ 25.  ],[   nan],[ 28.  ],[ 22.  ],[ 38.  ],[   nan],[   nan],[ 40.  ],[ 29.  ],[ 45.  ],[ 35.  ],[   nan],[ 30.  ],[ 60.  ],[   nan],[   nan],[ 24.  ],[ 25.  ],[ 18.  ],[ 19.  ],[ 22.  ],[  3.  ],[   nan],[ 22.  ],[ 27.  ],[ 20.  ],[ 19.  ],[ 42.  ],[  1.  ],[ 32.  ],[ 35.  ],[   nan],[ 18.  ],[  1.  ],[ 36.  ],[   nan],[ 17.  ],[ 36.  ],[ 21.  ],[ 28.  ],[ 23.  ],[ 24.  ],[ 22.  ],[ 31.  ],[ 46.  ],[ 23.  ],[ 28.  ],[ 39.  ],[ 26.  ],[ 21.  ],[ 28.  ],[ 20.  ],[ 34.  ],[ 51.  ],[  3.  ],[ 21.  ],[   nan],[   nan],[   nan],[ 33.  ],[   nan],[ 44.  ],[   nan],[ 34.  ],[ 18.  ],[ 30.  ],[ 10.  ],[   nan],[ 21.  ],[ 29.  ],[ 28.  ],[ 18.  ],[   nan],[ 28.  ],[ 19.  ],[   nan],[ 32.  ],[ 28.  ],[   nan],[ 42.  ],[ 17.  ],[ 50.  ],[ 14.  ],[ 21.  ],[ 24.  ],[ 64.  ],[ 31.  ],[ 45.  ],[ 20.  ],[ 25.  
],[ 28.  ],[   nan],[  4.  ],[ 13.  ],[ 34.  ],[  5.  ],[ 52.  ],[ 36.  ],[   nan],[ 30.  ],[ 49.  ],[   nan],[ 29.  ],[ 65.  ],[   nan],[ 50.  ],[   nan],[ 48.  ],[ 34.  ],[ 47.  ],[ 48.  ],[   nan],[ 38.  ],[   nan],[ 56.  ],[   nan],[  0.75],[   nan],[ 38.  ],[ 33.  ],[ 23.  ],[ 22.  ],[   nan],[ 34.  ],[ 29.  ],[ 22.  ],[  2.  ],[  9.  ],[   nan],[ 50.  ],[ 63.  ],[ 25.  ],[   nan],[ 35.  ],[ 58.  ],[ 30.  ],[  9.  ],[   nan],[ 21.  ],[ 55.  ],[ 71.  ],[ 21.  ],[   nan],[ 54.  ],[   nan],[ 25.  ],[ 24.  ],[ 17.  ],[ 21.  ],[   nan],[ 37.  ],[ 16.  ],[ 18.  ],[ 33.  ],[   nan],[ 28.  ],[ 26.  ],[ 29.  ],[   nan],[ 36.  ],[ 54.  ],[ 24.  ],[ 47.  ],[ 34.  ],[   nan],[ 36.  ],[ 32.  ],[ 30.  ],[ 22.  ],[   nan],[ 44.  ],[   nan],[ 40.5 ],[ 50.  ],[   nan],[ 39.  ],[ 23.  ],[  2.  ],[   nan],[ 17.  ],[   nan],[ 30.  ],[  7.  ],[ 45.  ],[ 30.  ],[   nan],[ 22.  ],[ 36.  ],[  9.  ],[ 11.  ],[ 32.  ],[ 50.  ],[ 64.  ],[ 19.  ],[   nan],[ 33.  ],[  8.  ],[ 17.  ],[ 27.  ],[   nan],[ 22.  ],[ 22.  ],[ 62.  ],[ 48.  ],[   nan],[ 39.  ],[ 36.  ],[   nan],[ 40.  ],[ 28.  ],[   nan],[   nan],[ 24.  ],[ 19.  ],[ 29.  ],[   nan],[ 32.  ],[ 62.  ],[ 53.  ],[ 36.  ],[   nan],[ 16.  ],[ 19.  ],[ 34.  ],[ 39.  ],[   nan],[ 32.  ],[ 25.  ],[ 39.  ],[ 54.  ],[ 36.  ],[   nan],[ 18.  ],[ 47.  ],[ 60.  ],[ 22.  ],[   nan],[ 35.  ],[ 52.  ],[ 47.  ],[   nan],[ 37.  ],[ 36.  ],[   nan],[ 49.  ],[   nan],[ 49.  ],[ 24.  ],[   nan],[   nan],[ 44.  ],[ 35.  ],[ 36.  ],[ 30.  ],[ 27.  ],[ 22.  ],[ 40.  ],[ 39.  ],[   nan],[   nan],[   nan],[ 35.  ],[ 24.  ],[ 34.  ],[ 26.  ],[  4.  ],[ 26.  ],[ 27.  ],[ 42.  ],[ 20.  ],[ 21.  ],[ 21.  ],[ 61.  ],[ 57.  ],[ 21.  ],[ 26.  ],[   nan],[ 80.  ],[ 51.  ],[ 32.  ],[   nan],[  9.  ],[ 28.  ],[ 32.  ],[ 31.  ],[ 41.  ],[   nan],[ 20.  ],[ 24.  ],[  2.  ],[   nan],[  0.75],[ 48.  ],[ 19.  ],[ 56.  ],[   nan],[ 23.  ],[   nan],[ 18.  ],[ 21.  ],[   nan],[ 18.  ],[ 24.  ],[   nan],[ 32.  ],[ 23.  ],[ 58.  ],[ 50.  ],[ 40.  ],[ 47.  ],[ 36.  ],[ 20.  
],[ 32.  ],[ 25.  ],[   nan],[ 43.  ],[   nan],[ 40.  ],[ 31.  ],[ 70.  ],[ 31.  ],[   nan],[ 18.  ],[ 24.5 ],[ 18.  ],[ 43.  ],[ 36.  ],[   nan],[ 27.  ],[ 20.  ],[ 14.  ],[ 60.  ],[ 25.  ],[ 14.  ],[ 19.  ],[ 18.  ],[ 15.  ],[ 31.  ],[  4.  ],[   nan],[ 25.  ],[ 60.  ],[ 52.  ],[ 44.  ],[   nan],[ 49.  ],[ 42.  ],[ 18.  ],[ 35.  ],[ 18.  ],[ 25.  ],[ 26.  ],[ 39.  ],[ 45.  ],[ 42.  ],[ 22.  ],[   nan],[ 24.  ],[   nan],[ 48.  ],[ 29.  ],[ 52.  ],[ 19.  ],[ 38.  ],[ 27.  ],[   nan],[ 33.  ],[  6.  ],[ 17.  ],[ 34.  ],[ 50.  ],[ 27.  ],[ 20.  ],[ 30.  ],[   nan],[ 25.  ],[ 25.  ],[ 29.  ],[ 11.  ],[   nan],[ 23.  ],[ 23.  ],[ 28.5 ],[ 48.  ],[ 35.  ],[   nan],[   nan],[   nan],[ 36.  ],[ 21.  ],[ 24.  ],[ 31.  ],[ 70.  ],[ 16.  ],[ 30.  ],[ 19.  ],[ 31.  ],[  4.  ],[  6.  ],[ 33.  ],[ 23.  ],[ 48.  ],[  0.67],[ 28.  ],[ 18.  ],[ 34.  ],[ 33.  ],[   nan],[ 41.  ],[ 20.  ],[ 36.  ],[ 16.  ],[ 51.  ],[   nan],[ 30.5 ],[   nan],[ 32.  ],[ 24.  ],[ 48.  ],[ 57.  ],[   nan],[ 54.  ],[ 18.  ],[   nan],[  5.  ],[   nan],[ 43.  ],[ 13.  ],[ 17.  ],[ 29.  ],[   nan],[ 25.  ],[ 25.  ],[ 18.  ],[  8.  ],[  1.  ],[ 46.  ],[   nan],[ 16.  ],[   nan],[   nan],[ 25.  ],[ 39.  ],[ 49.  ],[ 31.  ],[ 30.  ],[ 30.  ],[ 34.  ],[ 31.  ],[ 11.  ],[  0.42],[ 27.  ],[ 31.  ],[ 39.  ],[ 18.  ],[ 39.  ],[ 33.  ],[ 26.  ],[ 39.  ],[ 35.  ],[  6.  ],[ 30.5 ],[   nan],[ 23.  ],[ 31.  ],[ 43.  ],[ 10.  ],[ 52.  ],[ 27.  ],[ 38.  ],[ 27.  ],[  2.  ],[   nan],[   nan],[  1.  ],[   nan],[ 62.  ],[ 15.  ],[  0.83],[   nan],[ 23.  ],[ 18.  ],[ 39.  ],[ 21.  ],[   nan],[ 32.  ],[   nan],[ 20.  ],[ 16.  ],[ 30.  ],[ 34.5 ],[ 17.  ],[ 42.  ],[   nan],[ 35.  ],[ 28.  ],[   nan],[  4.  ],[ 74.  ],[  9.  ],[ 16.  ],[ 44.  ],[ 18.  ],[ 45.  ],[ 51.  ],[ 24.  ],[   nan],[ 41.  ],[ 21.  ],[ 48.  ],[   nan],[ 24.  ],[ 42.  ],[ 27.  ],[ 31.  ],[   nan],[  4.  ],[ 26.  ],[ 47.  ],[ 33.  ],[ 47.  ],[ 28.  ],[ 15.  ],[ 20.  ],[ 19.  ],[   nan],[ 56.  ],[ 25.  ],[ 33.  ],[ 22.  ],[ 28.  ],[ 25.  ],[ 39.  ],[ 27.  
],[ 19.  ],[   nan],[ 26.  ],[ 32.  ]])from sklearn.preprocessing import Imputerhelp(Imputer)
Help on class Imputer in module sklearn.preprocessing.imputation:class Imputer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)|  Imputation transformer for completing missing values.|  |  Read more in the :ref:`User Guide <imputation>`.|  |  Parameters|  ----------|  missing_values : integer or "NaN", optional (default="NaN")|      The placeholder for the missing values. All occurrences of|      `missing_values` will be imputed. For missing values encoded as np.nan,|      use the string value "NaN".|  |  strategy : string, optional (default="mean")|      The imputation strategy.|  |      - If "mean", then replace missing values using the mean along|        the axis.|      - If "median", then replace missing values using the median along|        the axis.|      - If "most_frequent", then replace missing using the most frequent|        value along the axis.|  |  axis : integer, optional (default=0)|      The axis along which to impute.|  |      - If `axis=0`, then impute along columns.|      - If `axis=1`, then impute along rows.|  |  verbose : integer, optional (default=0)|      Controls the verbosity of the imputer.|  |  copy : boolean, optional (default=True)|      If True, a copy of X will be created. If False, imputation will|      be done in-place whenever possible. 
Note that, in the following cases,|      a new copy will always be made, even if `copy=False`:|  |      - If X is not an array of floating values;|      - If X is sparse and `missing_values=0`;|      - If `axis=0` and X is encoded as a CSR matrix;|      - If `axis=1` and X is encoded as a CSC matrix.|  |  Attributes|  ----------|  statistics_ : array of shape (n_features,)|      The imputation fill value for each feature if axis == 0.|  |  Notes|  -----|  - When ``axis=0``, columns which only contained missing values at `fit`|    are discarded upon `transform`.|  - When ``axis=1``, an exception is raised if there are rows for which it is|    not possible to fill in the missing values (e.g., because they only|    contain missing values).|  |  Method resolution order:|      Imputer|      sklearn.base.BaseEstimator|      sklearn.base.TransformerMixin|      builtins.object|  |  Methods defined here:|  |  __init__(self, missing_values='NaN', strategy='mean', axis=0, verbose=0, copy=True)|      Initialize self.  
See help(type(self)) for accurate signature.|  |  fit(self, X, y=None)|      Fit the imputer on X.|      |      Parameters|      ----------|      X : {array-like, sparse matrix}, shape (n_samples, n_features)|          Input data, where ``n_samples`` is the number of samples and|          ``n_features`` is the number of features.|      |      Returns|      -------|      self : object|          Returns self.|  |  transform(self, X)|      Impute all missing values in X.|      |      Parameters|      ----------|      X : {array-like, sparse matrix}, shape = [n_samples, n_features]|          The input data to complete.|  |  ----------------------------------------------------------------------|  Methods inherited from sklearn.base.BaseEstimator:|  |  __getstate__(self)|  |  __repr__(self)|      Return repr(self).|  |  __setstate__(self, state)|  |  get_params(self, deep=True)|      Get parameters for this estimator.|      |      Parameters|      ----------|      deep : boolean, optional|          If True, will return the parameters for this estimator and|          contained subobjects that are estimators.|      |      Returns|      -------|      params : mapping of string to any|          Parameter names mapped to their values.|  |  set_params(self, **params)|      Set the parameters of this estimator.|      |      The method works on simple estimators as well as on nested objects|      (such as pipelines). 
The latter have parameters of the form|      ``<component>__<parameter>`` so that it's possible to update each|      component of a nested object.|      |      Returns|      -------|      self|  |  ----------------------------------------------------------------------|  Data descriptors inherited from sklearn.base.BaseEstimator:|  |  __dict__|      dictionary for instance variables (if defined)|  |  __weakref__|      list of weak references to the object (if defined)|  |  ----------------------------------------------------------------------|  Methods inherited from sklearn.base.TransformerMixin:|  |  fit_transform(self, X, y=None, **fit_params)|      Fit to data, then transform it.|      |      Fits transformer to X and y with optional parameters fit_params|      and returns a transformed version of X.|      |      Parameters|      ----------|      X : numpy array of shape [n_samples, n_features]|          Training set.|      |      y : numpy array of shape [n_samples]|          Target values.|      |      Returns|      -------|      X_new : numpy array of shape [n_samples, n_features_new]|          Transformed array.impu = Imputer(missing_values='NaN',strategy='mean',axis=0)
# Fill Age NaNs with the column mean via the (deprecated) Imputer.
# Constructed here so this cell is self-contained.
impu = Imputer(missing_values='NaN', strategy='mean', axis=0)
# fit_transform accepts a DataFrame directly; appending .values makes no difference.
age = impu.fit_transform(df_train[['Age']])
age
array([[ 22.        ],[ 38.        ],[ 26.        ],[ 35.        ],[ 35.        ],[ 29.69911765],[ 54.        ],[  2.        ],[ 27.        ],[ 14.        ],[  4.        ],[ 58.        ],[ 20.        ],[ 39.        ],[ 14.        ],[ 55.        ],[  2.        ],[ 29.69911765],[ 31.        ],[ 29.69911765],[ 35.        ],[ 34.        ],[ 15.        ],[ 28.        ],[  8.        ],[ 38.        ],[ 29.69911765],[ 19.        ],[ 29.69911765],[ 29.69911765],[ 40.        ],[ 29.69911765],[ 29.69911765],[ 66.        ],[ 28.        ],[ 42.        ],[ 29.69911765],[ 21.        ],[ 18.        ],[ 14.        ],[ 40.        ],[ 27.        ],[ 29.69911765],[  3.        ],[ 19.        ],[ 29.69911765],[ 29.69911765],[ 29.69911765],[ 29.69911765],[ 18.        ],[  7.        ],[ 21.        ],[ 49.        ],[ 29.        ],[ 65.        ],[ 29.69911765],[ 21.        ],[ 28.5       ],[  5.        ],[ 11.        ],[ 22.        ],[ 38.        ],[ 45.        ],[  4.        ],[ 29.69911765],[ 29.69911765],[ 29.        ],[ 19.        ],[ 17.        ],[ 26.        ],[ 32.        ],[ 16.        ],[ 21.        ],[ 26.        ],[ 32.        ],[ 25.        ],[ 29.69911765],[ 29.69911765],[  0.83      ],[ 30.        ],[ 22.        ],[ 29.        ],[ 29.69911765],[ 28.        ],[ 17.        ],[ 33.        ],[ 16.        ],[ 29.69911765],[ 23.        ],[ 24.        ],[ 29.        ],[ 20.        ],[ 46.        ],[ 26.        ],[ 59.        ],[ 29.69911765],[ 71.        ],[ 23.        ],[ 34.        ],[ 34.        ],[ 28.        ],[ 29.69911765],[ 21.        ],[ 33.        ],[ 37.        ],[ 28.        ],[ 21.        ],[ 29.69911765],[ 38.        ],[ 29.69911765],[ 47.        ],[ 14.5       ],[ 22.        ],[ 20.        ],[ 17.        ],[ 21.        ],[ 70.5       ],[ 29.        ],[ 24.        ],[  2.        ],[ 21.        ],[ 29.69911765],[ 32.5       ],[ 32.5       ],[ 54.        ],[ 12.        ],[ 29.69911765],[ 24.        ],[ 29.69911765],[ 45.        ],[ 33.        ],[ 20.        ],[ 47.        
],[ 29.        ],[ 25.        ],[ 23.        ],[ 19.        ],[ 37.        ],[ 16.        ],[ 24.        ],[ 29.69911765],[ 22.        ],[ 24.        ],[ 19.        ],[ 18.        ],[ 19.        ],[ 27.        ],[  9.        ],[ 36.5       ],[ 42.        ],[ 51.        ],[ 22.        ],[ 55.5       ],[ 40.5       ],[ 29.69911765],[ 51.        ],[ 16.        ],[ 30.        ],[ 29.69911765],[ 29.69911765],[ 44.        ],[ 40.        ],[ 26.        ],[ 17.        ],[  1.        ],[  9.        ],[ 29.69911765],[ 45.        ],[ 29.69911765],[ 28.        ],[ 61.        ],[  4.        ],[  1.        ],[ 21.        ],[ 56.        ],[ 18.        ],[ 29.69911765],[ 50.        ],[ 30.        ],[ 36.        ],[ 29.69911765],[ 29.69911765],[  9.        ],[  1.        ],[  4.        ],[ 29.69911765],[ 29.69911765],[ 45.        ],[ 40.        ],[ 36.        ],[ 32.        ],[ 19.        ],[ 19.        ],[  3.        ],[ 44.        ],[ 58.        ],[ 29.69911765],[ 42.        ],[ 29.69911765],[ 24.        ],[ 28.        ],[ 29.69911765],[ 34.        ],[ 45.5       ],[ 18.        ],[  2.        ],[ 32.        ],[ 26.        ],[ 16.        ],[ 40.        ],[ 24.        ],[ 35.        ],[ 22.        ],[ 30.        ],[ 29.69911765],[ 31.        ],[ 27.        ],[ 42.        ],[ 32.        ],[ 30.        ],[ 16.        ],[ 27.        ],[ 51.        ],[ 29.69911765],[ 38.        ],[ 22.        ],[ 19.        ],[ 20.5       ],[ 18.        ],[ 29.69911765],[ 35.        ],[ 29.        ],[ 59.        ],[  5.        ],[ 24.        ],[ 29.69911765],[ 44.        ],[  8.        ],[ 19.        ],[ 33.        ],[ 29.69911765],[ 29.69911765],[ 29.        ],[ 22.        ],[ 30.        ],[ 44.        ],[ 25.        ],[ 24.        ],[ 37.        ],[ 54.        ],[ 29.69911765],[ 29.        ],[ 62.        ],[ 30.        ],[ 41.        ],[ 29.        ],[ 29.69911765],[ 30.        ],[ 35.        ],[ 50.        ],[ 29.69911765],[  3.        ],[ 52.        ],[ 40.        ],[ 29.69911765],[ 36.        
],[ 16.        ],[ 25.        ],[ 58.        ],[ 35.        ],[ 29.69911765],[ 25.        ],[ 41.        ],[ 37.        ],[ 29.69911765],[ 63.        ],[ 45.        ],[ 29.69911765],[  7.        ],[ 35.        ],[ 65.        ],[ 28.        ],[ 16.        ],[ 19.        ],[ 29.69911765],[ 33.        ],[ 30.        ],[ 22.        ],[ 42.        ],[ 22.        ],[ 26.        ],[ 19.        ],[ 36.        ],[ 24.        ],[ 24.        ],[ 29.69911765],[ 23.5       ],[  2.        ],[ 29.69911765],[ 50.        ],[ 29.69911765],[ 29.69911765],[ 19.        ],[ 29.69911765],[ 29.69911765],[  0.92      ],[ 29.69911765],[ 17.        ],[ 30.        ],[ 30.        ],[ 24.        ],[ 18.        ],[ 26.        ],[ 28.        ],[ 43.        ],[ 26.        ],[ 24.        ],[ 54.        ],[ 31.        ],[ 40.        ],[ 22.        ],[ 27.        ],[ 30.        ],[ 22.        ],[ 29.69911765],[ 36.        ],[ 61.        ],[ 36.        ],[ 31.        ],[ 16.        ],[ 29.69911765],[ 45.5       ],[ 38.        ],[ 16.        ],[ 29.69911765],[ 29.69911765],[ 29.        ],[ 41.        ],[ 45.        ],[ 45.        ],[  2.        ],[ 24.        ],[ 28.        ],[ 25.        ],[ 36.        ],[ 24.        ],[ 40.        ],[ 29.69911765],[  3.        ],[ 42.        ],[ 23.        ],[ 29.69911765],[ 15.        ],[ 25.        ],[ 29.69911765],[ 28.        ],[ 22.        ],[ 38.        ],[ 29.69911765],[ 29.69911765],[ 40.        ],[ 29.        ],[ 45.        ],[ 35.        ],[ 29.69911765],[ 30.        ],[ 60.        ],[ 29.69911765],[ 29.69911765],[ 24.        ],[ 25.        ],[ 18.        ],[ 19.        ],[ 22.        ],[  3.        ],[ 29.69911765],[ 22.        ],[ 27.        ],[ 20.        ],[ 19.        ],[ 42.        ],[  1.        ],[ 32.        ],[ 35.        ],[ 29.69911765],[ 18.        ],[  1.        ],[ 36.        ],[ 29.69911765],[ 17.        ],[ 36.        ],[ 21.        ],[ 28.        ],[ 23.        ],[ 24.        ],[ 22.        ],[ 31.        ],[ 46.        ],[ 23.        
],[ 28.        ],[ 39.        ],[ 26.        ],[ 21.        ],[ 28.        ],[ 20.        ],[ 34.        ],[ 51.        ],[  3.        ],[ 21.        ],[ 29.69911765],[ 29.69911765],[ 29.69911765],[ 33.        ],[ 29.69911765],[ 44.        ],[ 29.69911765],[ 34.        ],[ 18.        ],[ 30.        ],[ 10.        ],[ 29.69911765],[ 21.        ],[ 29.        ],[ 28.        ],[ 18.        ],[ 29.69911765],[ 28.        ],[ 19.        ],[ 29.69911765],[ 32.        ],[ 28.        ],[ 29.69911765],[ 42.        ],[ 17.        ],[ 50.        ],[ 14.        ],[ 21.        ],[ 24.        ],[ 64.        ],[ 31.        ],[ 45.        ],[ 20.        ],[ 25.        ],[ 28.        ],[ 29.69911765],[  4.        ],[ 13.        ],[ 34.        ],[  5.        ],[ 52.        ],[ 36.        ],[ 29.69911765],[ 30.        ],[ 49.        ],[ 29.69911765],[ 29.        ],[ 65.        ],[ 29.69911765],[ 50.        ],[ 29.69911765],[ 48.        ],[ 34.        ],[ 47.        ],[ 48.        ],[ 29.69911765],[ 38.        ],[ 29.69911765],[ 56.        ],[ 29.69911765],[  0.75      ],[ 29.69911765],[ 38.        ],[ 33.        ],[ 23.        ],[ 22.        ],[ 29.69911765],[ 34.        ],[ 29.        ],[ 22.        ],[  2.        ],[  9.        ],[ 29.69911765],[ 50.        ],[ 63.        ],[ 25.        ],[ 29.69911765],[ 35.        ],[ 58.        ],[ 30.        ],[  9.        ],[ 29.69911765],[ 21.        ],[ 55.        ],[ 71.        ],[ 21.        ],[ 29.69911765],[ 54.        ],[ 29.69911765],[ 25.        ],[ 24.        ],[ 17.        ],[ 21.        ],[ 29.69911765],[ 37.        ],[ 16.        ],[ 18.        ],[ 33.        ],[ 29.69911765],[ 28.        ],[ 26.        ],[ 29.        ],[ 29.69911765],[ 36.        ],[ 54.        ],[ 24.        ],[ 47.        ],[ 34.        ],[ 29.69911765],[ 36.        ],[ 32.        ],[ 30.        ],[ 22.        ],[ 29.69911765],[ 44.        ],[ 29.69911765],[ 40.5       ],[ 50.        ],[ 29.69911765],[ 39.        ],[ 23.        ],[  2.        
],[ 29.69911765],[ 17.        ],[ 29.69911765],[ 30.        ],[  7.        ],[ 45.        ],[ 30.        ],[ 29.69911765],[ 22.        ],[ 36.        ],[  9.        ],[ 11.        ],[ 32.        ],[ 50.        ],[ 64.        ],[ 19.        ],[ 29.69911765],[ 33.        ],[  8.        ],[ 17.        ],[ 27.        ],[ 29.69911765],[ 22.        ],[ 22.        ],[ 62.        ],[ 48.        ],[ 29.69911765],[ 39.        ],[ 36.        ],[ 29.69911765],[ 40.        ],[ 28.        ],[ 29.69911765],[ 29.69911765],[ 24.        ],[ 19.        ],[ 29.        ],[ 29.69911765],[ 32.        ],[ 62.        ],[ 53.        ],[ 36.        ],[ 29.69911765],[ 16.        ],[ 19.        ],[ 34.        ],[ 39.        ],[ 29.69911765],[ 32.        ],[ 25.        ],[ 39.        ],[ 54.        ],[ 36.        ],[ 29.69911765],[ 18.        ],[ 47.        ],[ 60.        ],[ 22.        ],[ 29.69911765],[ 35.        ],[ 52.        ],[ 47.        ],[ 29.69911765],[ 37.        ],[ 36.        ],[ 29.69911765],[ 49.        ],[ 29.69911765],[ 49.        ],[ 24.        ],[ 29.69911765],[ 29.69911765],[ 44.        ],[ 35.        ],[ 36.        ],[ 30.        ],[ 27.        ],[ 22.        ],[ 40.        ],[ 39.        ],[ 29.69911765],[ 29.69911765],[ 29.69911765],[ 35.        ],[ 24.        ],[ 34.        ],[ 26.        ],[  4.        ],[ 26.        ],[ 27.        ],[ 42.        ],[ 20.        ],[ 21.        ],[ 21.        ],[ 61.        ],[ 57.        ],[ 21.        ],[ 26.        ],[ 29.69911765],[ 80.        ],[ 51.        ],[ 32.        ],[ 29.69911765],[  9.        ],[ 28.        ],[ 32.        ],[ 31.        ],[ 41.        ],[ 29.69911765],[ 20.        ],[ 24.        ],[  2.        ],[ 29.69911765],[  0.75      ],[ 48.        ],[ 19.        ],[ 56.        ],[ 29.69911765],[ 23.        ],[ 29.69911765],[ 18.        ],[ 21.        ],[ 29.69911765],[ 18.        ],[ 24.        ],[ 29.69911765],[ 32.        ],[ 23.        ],[ 58.        ],[ 50.        ],[ 40.        ],[ 47.        ],[ 36.        
],[ 20.        ],[ 32.        ],[ 25.        ],[ 29.69911765],[ 43.        ],[ 29.69911765],[ 40.        ],[ 31.        ],[ 70.        ],[ 31.        ],[ 29.69911765],[ 18.        ],[ 24.5       ],[ 18.        ],[ 43.        ],[ 36.        ],[ 29.69911765],[ 27.        ],[ 20.        ],[ 14.        ],[ 60.        ],[ 25.        ],[ 14.        ],[ 19.        ],[ 18.        ],[ 15.        ],[ 31.        ],[  4.        ],[ 29.69911765],[ 25.        ],[ 60.        ],[ 52.        ],[ 44.        ],[ 29.69911765],[ 49.        ],[ 42.        ],[ 18.        ],[ 35.        ],[ 18.        ],[ 25.        ],[ 26.        ],[ 39.        ],[ 45.        ],[ 42.        ],[ 22.        ],[ 29.69911765],[ 24.        ],[ 29.69911765],[ 48.        ],[ 29.        ],[ 52.        ],[ 19.        ],[ 38.        ],[ 27.        ],[ 29.69911765],[ 33.        ],[  6.        ],[ 17.        ],[ 34.        ],[ 50.        ],[ 27.        ],[ 20.        ],[ 30.        ],[ 29.69911765],[ 25.        ],[ 25.        ],[ 29.        ],[ 11.        ],[ 29.69911765],[ 23.        ],[ 23.        ],[ 28.5       ],[ 48.        ],[ 35.        ],[ 29.69911765],[ 29.69911765],[ 29.69911765],[ 36.        ],[ 21.        ],[ 24.        ],[ 31.        ],[ 70.        ],[ 16.        ],[ 30.        ],[ 19.        ],[ 31.        ],[  4.        ],[  6.        ],[ 33.        ],[ 23.        ],[ 48.        ],[  0.67      ],[ 28.        ],[ 18.        ],[ 34.        ],[ 33.        ],[ 29.69911765],[ 41.        ],[ 20.        ],[ 36.        ],[ 16.        ],[ 51.        ],[ 29.69911765],[ 30.5       ],[ 29.69911765],[ 32.        ],[ 24.        ],[ 48.        ],[ 57.        ],[ 29.69911765],[ 54.        ],[ 18.        ],[ 29.69911765],[  5.        ],[ 29.69911765],[ 43.        ],[ 13.        ],[ 17.        ],[ 29.        ],[ 29.69911765],[ 25.        ],[ 25.        ],[ 18.        ],[  8.        ],[  1.        ],[ 46.        ],[ 29.69911765],[ 16.        ],[ 29.69911765],[ 29.69911765],[ 25.        ],[ 39.        ],[ 49.        
],[ 31.        ],[ 30.        ],[ 30.        ],[ 34.        ],[ 31.        ],[ 11.        ],[  0.42      ],[ 27.        ],[ 31.        ],[ 39.        ],[ 18.        ],[ 39.        ],[ 33.        ],[ 26.        ],[ 39.        ],[ 35.        ],[  6.        ],[ 30.5       ],[ 29.69911765],[ 23.        ],[ 31.        ],[ 43.        ],[ 10.        ],[ 52.        ],[ 27.        ],[ 38.        ],[ 27.        ],[  2.        ],[ 29.69911765],[ 29.69911765],[  1.        ],[ 29.69911765],[ 62.        ],[ 15.        ],[  0.83      ],[ 29.69911765],[ 23.        ],[ 18.        ],[ 39.        ],[ 21.        ],[ 29.69911765],[ 32.        ],[ 29.69911765],[ 20.        ],[ 16.        ],[ 30.        ],[ 34.5       ],[ 17.        ],[ 42.        ],[ 29.69911765],[ 35.        ],[ 28.        ],[ 29.69911765],[  4.        ],[ 74.        ],[  9.        ],[ 16.        ],[ 44.        ],[ 18.        ],[ 45.        ],[ 51.        ],[ 24.        ],[ 29.69911765],[ 41.        ],[ 21.        ],[ 48.        ],[ 29.69911765],[ 24.        ],[ 42.        ],[ 27.        ],[ 31.        ],[ 29.69911765],[  4.        ],[ 26.        ],[ 47.        ],[ 33.        ],[ 47.        ],[ 28.        ],[ 15.        ],[ 20.        ],[ 19.        ],[ 29.69911765],[ 56.        ],[ 25.        ],[ 33.        ],[ 22.        ],[ 28.        ],[ 25.        ],[ 39.        ],[ 27.        ],[ 19.        ],[ 29.69911765],[ 26.        ],[ 32.        ]])import numpy as np
# Log-transform Age to compress its long right tail.
# DataFrame.apply maps the function column-wise; NaN ages stay NaN.
log_age = df_train[['Age']].apply(np.log)
log_age
Age
0   3.091042
1   3.637586
2   3.258097
3   3.555348
4   3.555348
5   NaN
6   3.988984
7   0.693147
8   3.295837
9   2.639057
10  1.386294
11  4.060443
12  2.995732
13  3.663562
14  2.639057
15  4.007333
16  0.693147
17  NaN
18  3.433987
19  NaN
20  3.555348
21  3.526361
22  2.708050
23  3.332205
24  2.079442
25  3.637586
26  NaN
27  2.944439
28  NaN
29  NaN
... ...
861 3.044522
862 3.871201
863 NaN
864 3.178054
865 3.737670
866 3.295837
867 3.433987
868 NaN
869 1.386294
870 3.258097
871 3.850148
872 3.496508
873 3.850148
874 3.332205
875 2.708050
876 2.995732
877 2.944439
878 NaN
879 4.025352
880 3.218876
881 3.496508
882 3.091042
883 3.332205
884 3.218876
885 3.663562
886 3.295837
887 2.944439
888 NaN
889 3.258097
890 3.465736
891 rows × 1 columnsfrom sklearn.preprocessing import MinMaxScaler
# Min-max normalization of Fare: x_norm = (x - min) / (max - min), range [0, 1].
mms = MinMaxScaler()
mms.fit_transform(df_train[['Fare']])  # double brackets select a one-column DataFrame — the 2-D input shape sklearn transformers expect
# Z-score standardization of Fare: (x - mean) / std.
SS = StandardScaler()
SS.fit_transform(df_train[['Fare']])

# Simple statistic feature.
# NOTE(review): the variable is named max_age but it holds the maximum of
# Fare (the printed 512.3292 below is Fare's max) — likely a copy-paste slip.
max_age = df_train['Fare'].max()
print(max_age)
512.3292min_age = df_train[['Age']].min()
print(min_age)
Age    0.42
dtype: float64df_train.loc[:,'family_size'] = df_train['SibSp'] + df_train['Parch'] + 1
df_train.head(10)
df_train['family_size']

# Polynomial / interaction features (default degree=2 -> bias, x, x^2).
from sklearn.preprocessing import PolynomialFeatures
PnF = PolynomialFeatures()
# NOTE(review): fit_transform(X, y) treats the second argument as the target,
# which PolynomialFeatures ignores — only SibSp is expanded here. To actually
# cross SibSp with Parch, pass both columns as X: df_train[['SibSp', 'Parch']].
Poly_fit = PnF.fit_transform(df_train[['SibSp']], df_train[['Parch']])
Poly_fit
array([[ 1.,  1.,  1.],[ 1.,  1.,  1.],[ 1.,  0.,  0.],..., [ 1.,  1.,  1.],[ 1.,  0.,  0.],[ 1.,  0.,  0.]])#cut将根据值本身来选择箱子均匀间隔,等长划分,qcut是根据这些值的频率来选择箱子的均匀间隔,等比划分。
# Binning: pd.cut makes equal-width bins over the value range;
# pd.qcut makes equal-frequency bins (each bin holds ~the same row count).
cutdata = pd.qcut(df_train['Age'], 8)
cutdata
df_train.loc[:, 'fare_cut'] = pd.cut(df_train['Fare'], 5)  # expects a 1-D array-like
df_train.head(30)
df_train.info()
# Group by Embarked (three levels: C, Q, S) to inspect per-port statistics.
dfg = df_train.groupby('Embarked')
dfg.describe()
# One-hot encode Embarked into 0/1 indicator columns, one per port.
embark_one_hot = pd.get_dummies(df_train['Embarked'])
embark_one_hotcar_time = pd.read_csv('car_data.csv')
car_time.head(10)car_time.loc[:,"date"] = pd.to_datetime(car_time["date_t"],format="")  #把object类型的date转成datetime的时间类型
car_time.head()
date_t  cnt date    month
0   2012-12-31  NaN 2012-12-31  12
1   2013-01-01  NaN 2013-01-01  1
2   2013-01-02  68.0    2013-01-02  1
3   2013-01-03  36.0    2013-01-03  1
4   2013-01-04  5565.0  2013-01-04  1car_time.loc[:,"month"] = car_time["date"].dt.monthcar_time.loc[:,"dom"] = car_time["date"].dt.day
car_time.head()
date_t  cnt date    month   dom
0   2012-12-31  NaN 2012-12-31  12  31
1   2013-01-01  NaN 2013-01-01  1   1
2   2013-01-02  68.0    2013-01-02  1   2
3   2013-01-03  36.0    2013-01-03  1   3
4   2013-01-04  5565.0  2013-01-04  1   4car_time.loc[:,"dow"] = car_time["date"].dt.dayofweek
def _is_weekend(dow):
    """Return 1 if dow is Saturday (5) or Sunday (6) under pandas dayofweek coding, else 0."""
    return 1 if dow >= 5 else 0

# Flag weekend days from the day-of-week column.
# pandas .dt.dayofweek encodes Monday=0 ... Saturday=5, Sunday=6, so the
# weekend test is dow >= 5. The original checked (dow == 6 or dow == 1),
# which marks Sunday and *Tuesday*: the sample output below shows Tuesdays
# 2013-01-01 and 2013-01-08 wrongly flagged as weekend=1.
car_time.loc[:, "weekend"] = car_time["dow"].apply(_is_weekend)
car_time.head(10)
date_t  cnt date    month   dom dow weekend
0   2012-12-31  NaN 2012-12-31  12  31  0   0
1   2013-01-01  NaN 2013-01-01  1   1   1   1
2   2013-01-02  68.0    2013-01-02  1   2   2   0
3   2013-01-03  36.0    2013-01-03  1   3   3   0
4   2013-01-04  5565.0  2013-01-04  1   4   4   0
5   2013-01-05  4966.0  2013-01-05  1   5   5   0
6   2013-01-06  3346.0  2013-01-06  1   6   6   1
7   2013-01-07  3396.0  2013-01-07  1   7   0   0
8   2013-01-08  4146.0  2013-01-08  1   8   1   1
9   2013-01-09  3096.0  2013-01-09  1   9   2   0#特殊类型,文本型
#1. 词袋模型
from sklearn.feature_extraction.text import CountVectorizer   #计数器verctorize = CountVectorizer()  #初始化CountVectorizer这个类,这个类init不用传入参数corpus = ['This is the first document.','This is the second second document.','And the third one.','Is this the first document?']X = verctorize.fit_transform(corpus)verctorize.get_feature_names()
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']​X.toarray() #['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'] 一一对应出现的次数
#总之,CountVectorizer 计算每个词在句子中出现的次数,并且形成向量化的形式,每个colum对应一个词,1代表and词在第一列出现了一次。
array([[0, 1, 1, 1, 0, 0, 1, 0, 1],[0, 1, 0, 1, 0, 2, 1, 0, 1],[1, 0, 0, 0, 1, 0, 1, 1, 0],[0, 1, 1, 1, 0, 0, 1, 0, 1]], dtype=int64)vec = CountVectorizer(ngram_range=(1,3))   #把关键字组合起来看次数
# Fit the 1- to 3-gram vectorizer on the corpus; each column of the dense
# matrix is the count of one n-gram (word or word sequence) per document.
X_ngram = vec.fit_transform(corpus)
X_ngram.toarray()
array([[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,0, 0, 0, 0, 1, 1, 1, 0, 0],[0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 2, 1, 1, 1, 1, 0, 0, 1, 1,0, 0, 0, 0, 1, 1, 1, 0, 0],[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,1, 1, 1, 1, 0, 0, 0, 0, 0],[0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,0, 0, 0, 0, 1, 0, 0, 1, 1]], dtype=int64)​###TF-IDF  带权重的
from sklearn.feature_extraction.text import TfidfVectorizertfid = TfidfVectorizer()tfid_X =tfid.fit_transform(corpus)tfid.get_feature_names()
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']tfid_X.toarray()  #告诉哪个词在第几个句子所占的比重是多少
array([[ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,0.        ,  0.35872874,  0.        ,  0.43877674],[ 0.        ,  0.27230147,  0.        ,  0.27230147,  0.        ,0.85322574,  0.22262429,  0.        ,  0.27230147],[ 0.55280532,  0.        ,  0.        ,  0.        ,  0.55280532,0.        ,  0.28847675,  0.55280532,  0.        ],[ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,0.        ,  0.35872874,  0.        ,  0.43877674]])tfid.get_feature_names()
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']df_train.head()
PassengerId Survived    Pclass  Name    Sex Age SibSp   Parch   Ticket  Fare    Cabin   Embarked    family_size fare_cut
0   1   0   3   Braund, Mr. Owen Harris male    22.0    1   0   A/5 21171   7.2500  NaN S   2   (-0.512, 102.466]
1   2   1   1   Cumings, Mrs. John Bradley (Florence Briggs Th...   female  38.0    1   0   PC 17599    71.2833 C85 C   2   (-0.512, 102.466]
2   3   1   3   Heikkinen, Miss. Laina  female  26.0    0   0   STON/O2. 3101282    7.9250  NaN S   1   (-0.512, 102.466]
3   4   1   1   Futrelle, Mrs. Jacques Heath (Lily May Peel)    female  35.0    1   0   113803  53.1000 C123    S   2   (-0.512, 102.466]
4   5   0   3   Allen, Mr. William Henry    male    35.0    0   0   373450  8.0500  NaN S   1   (-0.512, 102.466]#借助条件去获取组合特征df_train.loc[:,"alone"] = (df_train['SibSp']==0)&(df_train['Parch']==0)df_train.head()
PassengerId Survived    Pclass  Name    Sex Age SibSp   Parch   Ticket  Fare    Cabin   Embarked    family_size fare_cut    alone
0   1   0   3   Braund, Mr. Owen Harris male    22.0    1   0   A/5 21171   7.2500  NaN S   2   (-0.512, 102.466]   False
1   2   1   1   Cumings, Mrs. John Bradley (Florence Briggs Th...   female  38.0    1   0   PC 17599    71.2833 C85 C   2   (-0.512, 102.466]   False
2   3   1   3   Heikkinen, Miss. Laina  female  26.0    0   0   STON/O2. 3101282    7.9250  NaN S   1   (-0.512, 102.466]   True
3   4   1   1   Futrelle, Mrs. Jacques Heath (Lily May Peel)    female  35.0    1   0   113803  53.1000 C123    S   2   (-0.512, 102.466]   False
4   5   0   3   Allen, Mr. William Henry    male    35.0    0   0   373450  8.0500  NaN S   1   (-0.512, 102.466]   True#过滤式选择更加好的特征
# Filter-style feature selection: score each feature against the target
# with a univariate statistic and keep only the k best.
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest

iris = load_iris()
X, y = iris.data, iris.target
X_new = SelectKBest(k=2).fit_transform(X, y)
X_new.shape
(150, 2)X_new#递归的特征筛选
#包裹型 wrapper
from sklearn.feature_selection import RFE  #递归的特征筛选from sklearn.ensemble import RandomForestClassifier  #用rf去判定选择特征重要度
# Wrapper-style selection: RFE repeatedly fits the estimator and prunes the
# least important features until only two remain.
rf = RandomForestClassifier()
rfe = RFE(estimator=rf, n_features_to_select=2)
X_rfe = rfe.fit_transform(X, y)
X_rfe.shape
(150, 2)X_rfe[:5,:]  #最后2列
array([[ 1.4,  0.2],[ 1.4,  0.2],[ 1.3,  0.2],[ 1.5,  0.2],[ 1.4,  0.2]])#需要线性模型
#嵌入式
# Embedded selection: an L1-penalized linear SVM drives some coefficients to
# exactly zero; SelectFromModel then keeps only the non-zero-weight features.
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_embed = model.transform(X)
X_embed.shape
(150, 3)​

特征工程和数据预处理常用工具和方法相关推荐

  1. 特征工程之数据预处理(下)

    机器学习入门系列(2)–如何构建一个完整的机器学习项目,第四篇! 该系列的前三篇文章: 机器学习入门系列(2)–如何构建一个完整的机器学习项目(一) 机器学习数据集的获取和测试集的构建方法 特征工程之 ...

  2. 机器学习笔记六——特征工程之数据预处理

    特征工程之数据预处理 1. 处理缺失值 2. 处理异常值 2.1 异常值检测 2.2异常值处理 3.离散特征的连续化处理 4.连续特征的离散化处理 5. 处理类别不平衡问题 6. 图片数据扩充 数据预 ...

  3. 机器学习 | 特征工程(数据预处理、特征抽取)

    所谓特征工程即模型搭建之前进行的数据预处理和特征提取.有时人们常常好高骛远,数据都没处理好就开始折腾各种算法,从第一开始就有问题,那岂不是还没开始就已经结束了.所以说啊,不积跬步无以至千里,生活中的每 ...

  4. 机器学习实战——特征工程之数据预处理

    机器学习实战的特征工程主要包含数据预处理.特征构建.特征选择三步,首先来介绍数据预处理. 我选择python作为工具,并将主要用到pandas.numpy等数据工具库.加载库: import pand ...

  5. 特征工程之数据预处理(上)

    机器学习入门系列(2)–如何构建一个完整的机器学习项目,第三篇! 该系列的前两篇文章: 机器学习入门系列(2)–如何构建一个完整的机器学习项目(一) 机器学习数据集的获取和测试集的构建方法 分别介绍了 ...

  6. 【数据平台】sklearn库特征工程之数据预处理

    1.背景: 通过特征提取,我们能得到未经处理的特征,这时的特征可能有以下问题: 不属于同一量纲:即特征的规格不一样,不能够放在一起比较.无量纲化可以解决这一问题. 信息冗余:对于某些定量特征,其包含的 ...

  7. 多特征值数据预处理_「人工智能」No.6 特征工程之数据预处理

    [导读:当今人类即将或者已然了进入智能时代,这是·情报通·人工智能科普系列第[6]篇文章,欢迎阅读和收藏!] 1 基本概念 "数据和特征决定了机器学习的上限,而模型和算法只是逼近这个上限而已 ...

  8. 机器学习系列(3)_特征工程01数据预处理

    参考链接: 1.scikit-learn官网 2.sklearn提供的自带的数据集 3.Kaggle官网 4.数据挖掘--无量纲化 文章目录 一.数据中台 二.sklearn中的数据预处理与特征工程 ...

  9. 特征工程之数据预处理与可视化

    文章目录 前言 一.数据导入与查看 二.数据操作 三.可视化 1.显示两个特征的关系 2.热力图 前言 对于数据处理,我们可以运用python的一些库来完成和实现,下面是一些常用的程序代码总结 一.数 ...

最新文章

  1. 仿即刻的点赞滚动放大波纹图标
  2. linux链接curl库,Linux利用curl库快速开发http应用
  3. Java Servlet 编程,重定向介绍
  4. jvm形象简介之一看就懂
  5. 收藏 | Transformer 论文详细解读
  6. shell脚本实现printf数字转换N位补零
  7. java的HashCode方法
  8. Google手机移动网站适配(双向注释)
  9. 2. Zend_Controller 快速入门
  10. 颜色空间缩减,降低运算复杂度,保留代表性颜色
  11. NameNode中Fslmage镜像和Edits编辑日志详解
  12. 【Grpc】grpc中repeated的数据如何set值?
  13. python批量将png格式转换为jpg格式,并保存到新的文件夹
  14. 整理最全规范之Git仓库管理规范,Java开发规范,最全Java命名规范,数据库开发设计规范,接口设计规范
  15. java定义文章的难度系数
  16. 计算机视觉学习笔记(一)
  17. c语言around用法,around的用法知识点梳理
  18. mixin 和 mixins 区别 ?
  19. 为你,千千万万遍——《追风筝的人》
  20. 地摊金融沦为噱头?贷款需要有房产,“被城管驱逐就赔款”

热门文章

  1. DLMS/COSEM (IEC 62056, EN13757-1)协议简介
  2. python预测机票价格_一种国内机票价格预测方法与流程
  3. 微信网站域名被红(被封锁、被屏蔽、被和谐)的解决方法
  4. X3D爱好者QQ群:47542302
  5. 目标检测中的anchor、proposals、Region Proposal什么意思?
  6. 小猫爪:i.MX RT1050学习笔记26-RT1xxx系列的FlexCAN详解
  7. 【智能商业】看十年·曾鸣书院公开课:未来的商业是智能商业
  8. 大学计算机相关的证书有哪些,大学最有含金量的6大类证书!你拥有哪几个?...
  9. html简单个人网页制作 HTML5+CSS大作业——程序员个人简历设计(5页)
  10. 响应服务器554 5.7.1,554 5.7.1:中继访问被拒绝的centos后缀