#coding:utf-8
# Import the warnings module and install a filter that silences all warnings.
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

## 1) Load the training and test sets (space-separated CSV files)
path = './'
Train_data, Test_data = (
    pd.read_csv(path + csv_name, sep=' ')
    for csv_name in ('car_train_0110.csv', 'car_testA_0110.csv')
)

# Preview the first five rows of the training set.
Train_data.head()

	SaleID	name	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	...	v_14	v_15	v_16	v_17	v_18	v_19	v_20	v_21	v_22	v_23
0	134890	734	20160002	13.0	9	NaN	0.0	1.0	0	15.0	...	0.092139	0.000000	18.763832	-1.512063	-1.008718	-12.100623	-0.947052	9.077297	0.581214	3.945923
1	306648	196973	20080307	72.0	9	7.0	5.0	1.0	173	15.0	...	0.001070	0.122335	-5.685612	-0.489963	-2.223693	-0.226865	-0.658246	-3.949621	4.593618	-1.145653
2	340675	25347	20020312	18.0	12	3.0	0.0	1.0	50	12.5	...	0.064410	0.003345	-3.295700	1.816499	3.554439	-0.683675	0.971495	2.625318	-0.851922	-1.246135
3	57332	5382	20000611	38.0	8	7.0	0.0	1.0	54	15.0	...	0.069231	0.000000	-3.405521	1.497826	4.782636	0.039101	1.227646	3.040629	-0.801854	-1.251894
4	265235	173174	20030109	87.0	0	5.0	5.0	1.0	131	3.0	...	0.000099	0.001655	-4.475429	0.124138	1.364567	-0.319848	-1.131568	-3.303424	-1.998466	-1.279368

5 rows × 40 columns

所有特征集均脱敏处理(方便大家观看)
name - 汽车编码
regDate - 汽车注册时间
model - 车型编码
brand - 品牌
bodyType - 车身类型
fuelType - 燃油类型
gearbox - 变速箱
power - 汽车功率
kilometer - 汽车行驶公里
notRepairedDamage - 汽车有尚未修复的损坏
regionCode - 看车地区编码
seller - 销售方
offerType - 报价类型
creatDate - 广告发布时间
price - 汽车价格
‘v_0’, ‘v_1’, ‘v_2’, ‘v_3’, ‘v_4’, ‘v_5’, ‘v_6’, ‘v_7’, ‘v_8’, ‘v_9’, ‘v_10’, ‘v_11’, ‘v_12’, ‘v_13’, ‘v_14’, ‘v_15’, ‘v_16’, ‘v_17’, ‘v_18’, ‘v_19’, ‘v_20’, ‘v_21’, ‘v_22’, ‘v_23’ 【匿名特征，包含 v_0 至 v_23 在内共 24 个匿名特征】

# Preview the first and last five rows of the training set.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. The original appended Test_data.tail(), which mixed
# test rows (no price column) into what is a training-set preview.
pd.concat([Train_data.head(), Train_data.tail()])

	SaleID	name	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	...	v_14	v_15	v_16	v_17	v_18	v_19	v_20	v_21	v_22	v_23
0	134890	734	20160002	13.0	9	NaN	0.0	1.0	0	15.0	...	0.092139	0.000000	18.763832	-1.512063	-1.008718	-12.100623	-0.947052	9.077297	0.581214	3.945923
1	306648	196973	20080307	72.0	9	7.0	5.0	1.0	173	15.0	...	0.001070	0.122335	-5.685612	-0.489963	-2.223693	-0.226865	-0.658246	-3.949621	4.593618	-1.145653
2	340675	25347	20020312	18.0	12	3.0	0.0	1.0	50	12.5	...	0.064410	0.003345	-3.295700	1.816499	3.554439	-0.683675	0.971495	2.625318	-0.851922	-1.246135
3	57332	5382	20000611	38.0	8	7.0	0.0	1.0	54	15.0	...	0.069231	0.000000	-3.405521	1.497826	4.782636	0.039101	1.227646	3.040629	-0.801854	-1.251894
4	265235	173174	20030109	87.0	0	5.0	5.0	1.0	131	3.0	...	0.000099	0.001655	-4.475429	0.124138	1.364567	-0.319848	-1.131568	-3.303424	-1.998466	-1.279368
49995	375033	3803	20010407	6.0	29	5.0	0.0	0.0	186	10.0	...	0.000000	0.000372	-3.397636	0.940183	4.115667	0.146320	-2.348749	-2.636560	-0.965214	-1.097192
49996	406556	28500	20071001	130.0	10	2.0	0.0	0.0	272	7.0	...	0.003208	0.116459	-7.055336	-1.260228	-4.937979	0.881517	-1.590285	-3.495608	3.301887	3.947193
49997	511668	98383	19980102	23.0	10	4.0	0.0	1.0	190	0.5	...	0.049580	0.067015	-4.916501	0.507919	-0.035475	0.256285	0.734084	0.779931	1.822416	5.012697
49998	533139	1489	20031001	70.0	1	7.0	4.0	NaN	101	15.0	...	0.084591	0.000000	-0.424439	3.893203	-0.146884	1.830694	18.008141	-2.513048	-3.310876	-1.589404
49999	592803	994	20070407	76.0	0	4.0	5.0	NaN	0	15.0	...	0.055724	0.110924	-1.422750	2.749703	-2.160718	0.838089	17.664283	-5.802325	3.063008	-1.308131

10 rows × 40 columns

# Dimensions of the training set as (rows, columns).
Train_data.shape

(250000, 40)

# Preview the first and last five rows of the test set.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
pd.concat([Test_data.head(), Test_data.tail()])

	SaleID	name	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	...	v_14	v_15	v_16	v_17	v_18	v_19	v_20	v_21	v_22	v_23
0	720326	505	20060505	19.0	13	7.0	0.0	1.0	90	8.0	...	0.083340	0.105382	-5.998993	0.147048	-1.902847	0.348990	2.324961	3.343910	4.048742	-1.431822
1	714316	1836	20010301	5.0	5	3.0	4.0	1.0	75	15.0	...	0.074478	0.000000	-3.287221	2.081317	2.937052	-0.123018	1.202395	3.570743	-1.180587	-1.348598
2	704693	212291	20170610	6.0	18	NaN	5.0	0.0	150	15.0	...	0.002032	0.000000	4.368218	8.252188	-4.136109	-13.334970	-4.444620	-0.706978	-1.720218	3.569112
3	624972	1345	19820005	215.0	32	7.0	0.0	1.0	0	6.0	...	0.098806	0.100883	-2.537486	0.513955	4.414962	0.357685	2.700732	5.323602	6.085956	-0.900585
4	669753	1428	20060205	30.0	4	7.0	5.0	1.0	122	15.0	...	0.088397	0.002509	-6.197633	-0.191814	-1.224360	-0.326985	2.254931	4.183037	-2.574004	0.014203
49995	375033	3803	20010407	6.0	29	5.0	0.0	0.0	186	10.0	...	0.000000	0.000372	-3.397636	0.940183	4.115667	0.146320	-2.348749	-2.636560	-0.965214	-1.097192
49996	406556	28500	20071001	130.0	10	2.0	0.0	0.0	272	7.0	...	0.003208	0.116459	-7.055336	-1.260228	-4.937979	0.881517	-1.590285	-3.495608	3.301887	3.947193
49997	511668	98383	19980102	23.0	10	4.0	0.0	1.0	190	0.5	...	0.049580	0.067015	-4.916501	0.507919	-0.035475	0.256285	0.734084	0.779931	1.822416	5.012697
49998	533139	1489	20031001	70.0	1	7.0	4.0	NaN	101	15.0	...	0.084591	0.000000	-0.424439	3.893203	-0.146884	1.830694	18.008141	-2.513048	-3.310876	-1.589404
49999	592803	994	20070407	76.0	0	4.0	5.0	NaN	0	15.0	...	0.055724	0.110924	-1.422750	2.749703	-2.160718	0.838089	17.664283	-5.802325	3.063008	-1.308131

10 rows × 39 columns

# Dimensions of the test set as (rows, columns).
Test_data.shape

(50000, 39)

# Summary statistics (count, mean, std, min, quartiles, max) per training column.
Train_data.describe()

	SaleID	name	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	...	v_14	v_15	v_16	v_17	v_18	v_19	v_20	v_21	v_22	v_23
count	250000.000000	250000.000000	2.500000e+05	250000.000000	250000.000000	224620.000000	227510.000000	236487.000000	250000.000000	250000.000000	...	250000.000000	250000.000000	250000.000000	250000.000000	250000.000000	250000.000000	250000.000000	250000.000000	250000.000000	250000.000000
mean	185351.790768	83153.362172	2.003401e+07	44.911480	7.785236	4.563271	1.665008	0.780783	115.528412	12.577418	...	0.032489	0.030408	0.014725	0.000915	0.006273	0.006604	-0.001374	0.000609	-0.004025	0.001834
std	107121.188763	72540.799964	7.770250e+04	50.640081	7.694010	1.912515	2.339646	0.413717	196.141828	3.990632	...	0.038792	0.049333	8.779163	5.771081	4.880981	4.124722	3.803626	3.555353	2.864713	2.323680
min	1.000000	0.000000	1.910000e+07	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.500000	...	0.000000	0.000000	-10.412444	-15.538236	-21.009214	-13.989955	-9.599285	-11.181255	-7.671327	-2.350888
25%	92501.750000	14500.000000	1.999061e+07	6.000000	1.000000	3.000000	0.000000	1.000000	70.000000	12.500000	...	0.000129	0.000000	-5.552269	-0.901181	-3.150385	-0.478173	-1.727237	-3.067073	-2.092178	-1.402804
50%	185264.500000	65314.500000	2.003111e+07	27.000000	6.000000	4.000000	0.000000	1.000000	105.000000	15.000000	...	0.001961	0.002567	-3.821770	0.223181	-0.058502	0.038427	-0.995044	-0.880587	-1.199807	-1.145588
75%	278128.500000	143761.250000	2.008081e+07	70.000000	11.000000	7.000000	5.000000	1.000000	150.000000	15.000000	...	0.075672	0.056568	3.599747	1.263737	2.800475	0.569198	1.563382	3.269987	2.737614	0.044865
max	370946.000000	233044.000000	2.019121e+07	250.000000	39.000000	7.000000	6.000000	1.000000	20000.000000	15.000000	...	0.130785	0.184340	36.756878	26.134561	23.055660	16.576027	20.324572	14.039422	8.764597	8.574730

8 rows × 40 columns

# Summary statistics (count, mean, std, min, quartiles, max) per test column.
Test_data.describe()

	SaleID	name	regDate	model	brand	bodyType	fuelType	gearbox	power	kilometer	...	v_14	v_15	v_16	v_17	v_18	v_19	v_20	v_21	v_22	v_23
count	50000.000000	50000.000000	5.000000e+04	50000.000000	50000.000000	44890.000000	45598.000000	47287.000000	50000.000000	50000.000000	...	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000	50000.000000
mean	556029.053380	82878.251420	2.003441e+07	44.922840	7.779420	4.556226	1.681192	0.781081	114.116060	12.555210	...	0.032570	0.030773	-0.024819	0.007051	-0.008488	-0.030104	0.014609	-0.003353	0.013125	-0.011936
std	106952.402565	72292.076936	7.788055e+04	50.576255	7.661667	1.908291	2.344829	0.413518	177.274154	4.034901	...	0.038779	0.049521	8.759663	5.784299	4.825261	4.100561	3.812667	3.548944	2.866774	2.316144
min	370951.000000	0.000000	1.910000e+07	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.500000	...	0.000000	0.000000	-10.196998	-15.167961	-21.925773	-13.682825	-9.282567	-11.117367	-6.365723	-2.394516
25%	463258.500000	14121.250000	1.999061e+07	6.000000	1.000000	3.000000	0.000000	1.000000	69.000000	12.500000	...	0.000135	0.000000	-5.575131	-0.891030	-3.105073	-0.481952	-1.697763	-3.069575	-2.089326	-1.402958
50%	556296.000000	65359.000000	2.003111e+07	27.000000	6.000000	4.000000	0.000000	1.000000	105.000000	15.000000	...	0.001949	0.002593	-3.837572	0.221379	-0.081836	0.039376	-0.971210	-0.877377	-1.192502	-1.146398
75%	648862.250000	143083.750000	2.008091e+07	70.000000	11.000000	7.000000	5.000000	1.000000	150.000000	15.000000	...	0.075826	0.062063	3.531269	1.257687	2.784538	0.560046	1.572508	3.276918	2.772742	-0.010769
max	741887.000000	233028.000000	2.019040e+07	248.000000	39.000000	7.000000	6.000000	1.000000	17700.000000	15.000000	...	0.135900	0.180091	36.364986	26.043572	22.598441	16.333051	20.273633	11.691851	7.970303	8.749647

8 rows × 39 columns

## 2) Use info() to get familiar with the column dtypes and non-null counts
Train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 40 columns):#   Column             Non-Null Count   Dtype
---  ------             --------------   -----  0   SaleID             250000 non-null  int64  1   name               250000 non-null  int64  2   regDate            250000 non-null  int64  3   model              250000 non-null  float644   brand              250000 non-null  int64  5   bodyType           224620 non-null  float646   fuelType           227510 non-null  float647   gearbox            236487 non-null  float648   power              250000 non-null  int64  9   kilometer          250000 non-null  float6410  notRepairedDamage  201464 non-null  float6411  regionCode         250000 non-null  int64  12  seller             250000 non-null  int64  13  offerType          250000 non-null  int64  14  creatDate          250000 non-null  int64  15  price              250000 non-null  int64  16  v_0                250000 non-null  float6417  v_1                250000 non-null  float6418  v_2                250000 non-null  float6419  v_3                250000 non-null  float6420  v_4                250000 non-null  float6421  v_5                250000 non-null  float6422  v_6                250000 non-null  float6423  v_7                250000 non-null  float6424  v_8                250000 non-null  float6425  v_9                250000 non-null  float6426  v_10               250000 non-null  float6427  v_11               250000 non-null  float6428  v_12               250000 non-null  float6429  v_13               250000 non-null  float6430  v_14               250000 non-null  float6431  v_15               250000 non-null  float6432  v_16               250000 non-null  float6433  v_17               250000 non-null  float6434  v_18               250000 non-null  float6435  v_19               250000 non-null  float6436  v_20               250000 non-null  float6437  v_21               250000 non-null  float6438  v_22               250000 non-null  float6439  v_23               250000 non-null  float64
dtypes: float64(30), int64(10)
memory usage: 76.3 MB

# Column dtypes and non-null counts of the test set.
Test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 39 columns):#   Column             Non-Null Count  Dtype
---  ------             --------------  -----  0   SaleID             50000 non-null  int64  1   name               50000 non-null  int64  2   regDate            50000 non-null  int64  3   model              50000 non-null  float644   brand              50000 non-null  int64  5   bodyType           44890 non-null  float646   fuelType           45598 non-null  float647   gearbox            47287 non-null  float648   power              50000 non-null  int64  9   kilometer          50000 non-null  float6410  notRepairedDamage  40372 non-null  float6411  regionCode         50000 non-null  int64  12  seller             50000 non-null  int64  13  offerType          50000 non-null  int64  14  creatDate          50000 non-null  int64  15  v_0                50000 non-null  float6416  v_1                50000 non-null  float6417  v_2                50000 non-null  float6418  v_3                50000 non-null  float6419  v_4                50000 non-null  float6420  v_5                50000 non-null  float6421  v_6                50000 non-null  float6422  v_7                50000 non-null  float6423  v_8                50000 non-null  float6424  v_9                50000 non-null  float6425  v_10               50000 non-null  float6426  v_11               50000 non-null  float6427  v_12               50000 non-null  float6428  v_13               50000 non-null  float6429  v_14               50000 non-null  float6430  v_15               50000 non-null  float6431  v_16               50000 non-null  float6432  v_17               50000 non-null  float6433  v_18               50000 non-null  float6434  v_19               50000 non-null  float6435  v_20               50000 non-null  float6436  v_21               50000 non-null  float6437  v_22               50000 non-null  float6438  v_23               50000 non-null  float64
dtypes: float64(30), int64(9)
memory usage: 14.9 MB

# Number of missing values in each training column.
Train_data.isnull().sum()

SaleID                   0
name                     0
regDate                  0
model                    0
brand                    0
bodyType             25380
fuelType             22490
gearbox              13513
power                    0
kilometer                0
notRepairedDamage    48536
regionCode               0
seller                   0
offerType                0
creatDate                0
price                    0
v_0                      0
v_1                      0
v_2                      0
v_3                      0
v_4                      0
v_5                      0
v_6                      0
v_7                      0
v_8                      0
v_9                      0
v_10                     0
v_11                     0
v_12                     0
v_13                     0
v_14                     0
v_15                     0
v_16                     0
v_17                     0
v_18                     0
v_19                     0
v_20                     0
v_21                     0
v_22                     0
v_23                     0
dtype: int64

# Number of missing values in each test column.
Test_data.isnull().sum()

SaleID                  0
name                    0
regDate                 0
model                   0
brand                   0
bodyType             5110
fuelType             4402
gearbox              2713
power                   0
kilometer               0
notRepairedDamage    9628
regionCode              0
seller                  0
offerType               0
creatDate               0
v_0                     0
v_1                     0
v_2                     0
v_3                     0
v_4                     0
v_5                     0
v_6                     0
v_7                     0
v_8                     0
v_9                     0
v_10                    0
v_11                    0
v_12                    0
v_13                    0
v_14                    0
v_15                    0
v_16                    0
v_17                    0
v_18                    0
v_19                    0
v_20                    0
v_21                    0
v_22                    0
v_23                    0
dtype: int64

# Visualise NaNs: bar chart of the training columns that contain missing
# values, ordered from fewest to most missing.
null_counts = Train_data.isnull().sum()
missing = null_counts[null_counts > 0].sort_values()
missing.plot.bar()

<AxesSubplot:>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IF3bYWr1-1618460688068)(output_14_1.png)]

# Visualise the missing values: nullity matrix for a random sample of 250
# training rows.
msno.matrix(Train_data.sample(250))

<AxesSubplot:>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Hmg3T2pE-1618460688071)(output_15_1.png)]

# Per-column completeness bar chart for a random sample of 1000 training rows.
msno.bar(Train_data.sample(1000))

<AxesSubplot:>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-giHsOp0o-1618460688080)(output_16_1.png)]

# NOTE(review): same call as the previous cell (only the random sample
# differs) -- possibly a different missingno plot was intended here.
msno.bar(Train_data.sample(1000))

<AxesSubplot:>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-O6oFq5c8-1618460688082)(output_17_1.png)]

# Re-check the column dtypes and non-null counts of the training set.
Train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 40 columns):#   Column             Non-Null Count   Dtype
---  ------             --------------   -----  0   SaleID             250000 non-null  int64  1   name               250000 non-null  int64  2   regDate            250000 non-null  int64  3   model              250000 non-null  float644   brand              250000 non-null  int64  5   bodyType           224620 non-null  float646   fuelType           227510 non-null  float647   gearbox            236487 non-null  float648   power              250000 non-null  int64  9   kilometer          250000 non-null  float6410  notRepairedDamage  201464 non-null  float6411  regionCode         250000 non-null  int64  12  seller             250000 non-null  int64  13  offerType          250000 non-null  int64  14  creatDate          250000 non-null  int64  15  price              250000 non-null  int64  16  v_0                250000 non-null  float6417  v_1                250000 non-null  float6418  v_2                250000 non-null  float6419  v_3                250000 non-null  float6420  v_4                250000 non-null  float6421  v_5                250000 non-null  float6422  v_6                250000 non-null  float6423  v_7                250000 non-null  float6424  v_8                250000 non-null  float6425  v_9                250000 non-null  float6426  v_10               250000 non-null  float6427  v_11               250000 non-null  float6428  v_12               250000 non-null  float6429  v_13               250000 non-null  float6430  v_14               250000 non-null  float6431  v_15               250000 non-null  float6432  v_16               250000 non-null  float6433  v_17               250000 non-null  float6434  v_18               250000 non-null  float6435  v_19               250000 non-null  float6436  v_20               250000 non-null  float6437  v_21               250000 non-null  float6438  v_22               250000 non-null  float6439  v_23               250000 non-null  float64
dtypes: float64(30), int64(10)
memory usage: 76.3 MB

# How often each value of notRepairedDamage occurs in the test set.
Test_data['notRepairedDamage'].value_counts()

1.0    35555
0.0     4817
Name: notRepairedDamage, dtype: int64

# Treat the '-' placeholder in notRepairedDamage as a missing value.
# The result is assigned back rather than using replace(..., inplace=True) on
# a column selection: that form can operate on a temporary copy and leave the
# DataFrame unchanged (it never updates the frame under pandas copy-on-write).
# NOTE(review): info() reports this column as float64 in this data set, so no
# '-' entries are expected and the call is effectively a no-op here.
Test_data['notRepairedDamage'] = Test_data['notRepairedDamage'].replace('-', np.nan)

# How often each value of seller occurs in the training set.
Train_data["seller"].value_counts()

1    249999
0         1
Name: seller, dtype: int64

# How often each value of offerType occurs in the training set.
Train_data["offerType"].value_counts()

0    249991
1         9
Name: offerType, dtype: int64

# Remove 'seller' and 'offerType': in the training set each of them takes a
# single value in all but a handful of rows, so they carry almost no signal.
# DataFrame.drop(..., errors='ignore') keeps this cell safe to re-run; the
# original `del` statements raise KeyError once the columns are already gone.
for frame in (Train_data, Test_data):
    frame.drop(columns=['seller', 'offerType'], inplace=True, errors='ignore')

---------------------------------------------------------------------------KeyError                                  Traceback (most recent call last)C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)3079             try:
-> 3080                 return self._engine.get_loc(casted_key)3081             except KeyError as err:pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()KeyError: 'seller'The above exception was the direct cause of the following exception:KeyError                                  Traceback (most recent call last)<ipython-input-25-ac78d43311cb> in <module>
----> 1 del Train_data["seller"]2 del Train_data["offerType"]3 del Test_data["seller"]4 del Test_data["offerType"]5 #drop函数用于pandasC:\ProgramData\Anaconda3\lib\site-packages\pandas\core\generic.py in __delitem__(self, key)3964             # there was no match, this call should raise the appropriate3965             # exception:
-> 3966             loc = self.axes[-1].get_loc(key)3967             self._mgr.idelete(loc)3968 C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)3080                 return self._engine.get_loc(casted_key)3081             except KeyError as err:
-> 3082                 raise KeyError(key) from err3083 3084         if tolerance is not None:KeyError: 'seller'

# The price column of the training set.
Train_data['price']

0           520
1          5500
2          1100
3          1200
4          3300...
249995     1200
249996     1200
249997    16500
249998    31950
249999     1990
Name: price, Length: 250000, dtype: int64

# How often each distinct price occurs in the training set.
Train_data['price'].value_counts()

0        7312
500      3815
1500     3587
1000     3149
1200     3071...
11320       1
7230        1
11448       1
9529        1
8188        1
Name: price, Length: 4585, dtype: int64

%matplotlib inline  #IPython notebook中的魔法方法，这样每次运行后可以直接得到图像，不再需要使用plt.show()
import numpy as np  #导入numpy包，用于生成数组
import seaborn as sns  #习惯上简写成sns
sns.set()#切换到seaborn的默认运行配置

UsageError: unrecognized arguments: #IPython notebook中的魔法方法，这样每次运行后可以直接得到图像，不再需要使用plt.show()

# Draw 100 samples from a standard normal distribution and plot their kernel
# density estimate; cut=0 stops the curve at the extreme data points.
x=np.random.randn(100)
sns.kdeplot(x,cut=0)

<AxesSubplot:ylabel='Density'>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-BZyyCorw-1618460688084)(output_27_1.png)]

# Bivariate kernel density estimate of two independent standard-normal samples.
# The data are passed as x=/y= keywords and fill= replaces shade=: seaborn 0.11
# deprecated both the positional `y` argument and the `shade` alias, and
# seaborn 0.12+ no longer accepts `y` positionally.
y = np.random.randn(100)
sns.kdeplot(x=x, y=y, fill=True)
sns.kdeplot(x=x, y=y, fill=True, cbar=True)  # drawn again, this time with a colour bar

<AxesSubplot:>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UA8LOj4n-1618460688086)(output_28_1.png)]

distplot()集合了matplotlib的hist()与核密度估计kdeplot的功能，增加了rugplot分布观测条显示与利用scipy库fit拟合参数分布的新颖用途。具体用法如下：

seaborn.distplot(a,bins=None,hist=True,kde=True, rug=False, fit=None, hist_kws=None, kde_kws=None, rug_kws=None, fit_kws=None, color=None, vertical=False, norm_hist=False, axlabel=None, label=None, ax=None)
先介绍一下直方图(Histograms)：

直方图又称质量分布图，它是表示资料变化情况的一种主要工具。用直方图可以解析出资料的规则性，比较直观地看出产品质量特性的分布状态，对于资料分布状况一目了然，便于判断其总体质量分布情况。直方图表示通过沿数据范围形成分箱，然后绘制条以显示落入每个分箱的观测次数的数据分布。

接下来还是通过具体的例子来体验一下distplot的用法：

# Histogram plus KDE curve of x, drawn in green.
sns.distplot(x,color="g")

<AxesSubplot:ylabel='Density'>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-uPlnSZg2-1618460688087)(output_31_1.png)]

# Compare the three distplot modes side by side.
import matplotlib.pyplot as plt
fig,axes=plt.subplots(1,3) # create a figure with one row and three columns of axes
sns.distplot(x,ax=axes[0]) # left: histogram with KDE curve (the defaults)
sns.distplot(x,hist=False,ax=axes[1]) # middle: KDE curve only
sns.distplot(x,kde=False,ax=axes[2]) # right: histogram only

<AxesSubplot:>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QB16zc3q-1618460688088)(output_32_1.png)]

## 1) Overall distribution of the target (unbounded Johnson, normal, log-normal)
# "Johnson distribution" for short: the distribution of a random variable that
# becomes normally distributed after a Johnson transformation.
# Question for the three figures below: which family describes the price
# best -- unbounded Johnson (johnsonsu), normal (norm) or log-normal (lognorm)?
# (Kept as comments: a trailing bare string literal is an expression, and the
# notebook echoes it back as the cell's output.)
import scipy.stats as st
y = Train_data['price']

plt.figure(1)
plt.title('Johnson SU')
sns.distplot(y, kde=True, fit=st.johnsonsu)

plt.figure(2)
plt.title('Normal')
sns.distplot(y, kde=False, fit=st.norm)

plt.figure(3)
plt.title('Log Normal')
sns.distplot(y, kde=False, fit=st.lognorm)

'我们看看他符合什么总体分布\n无界约翰逊分布johnsonsu？\n正态norm？\n对数正态（比正态偏上一点）lognorm？\n\n'

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-BIRc6ZH0-1618460688091)(output_33_1.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UXCH7aXj-1618460688093)(output_33_2.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-dVSpJeqf-1618460688093)(output_33_3.png)]

2) 查看skewness偏度和 kurtosis峰度

偏度: 是描述数据分布形态的统计量，其描述的是某总体取值分布的对称性，简单来说就是数据的不对称程度,绝对值越大表明数据分布越不对称，偏斜程度大

峰度: 描述某变量所有取值分布形态陡缓程度的统计量，简单来说就是数据分布顶的尖锐程度(>0尖顶峰, <0平顶峰, =0与正态分布陡峭程度一致)

## 2) Inspect the skewness and kurtosis of the target
# Plot the price distribution (the trailing semicolon suppresses the echoed
# return value in the notebook), then print both statistics.
sns.distplot(Train_data['price']);
print("Skewness: %f" % Train_data['price'].skew())
print("Kurtosis: %f" % Train_data['price'].kurt())

Skewness: 3.535346
Kurtosis: 21.230678

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-sTZN54X1-1618460688095)(output_35_1.png)]

# Skewness and kurtosis of every training column, as a tuple of two Series.
Train_data.skew(), Train_data.kurt()

(SaleID                0.001712name                  0.513079regDate              -1.540844model                 1.499765brand                 1.314846bodyType             -0.070459fuelType              0.701802gearbox              -1.357379power                58.590829kilometer            -1.557472notRepairedDamage    -2.312519regionCode            0.690405creatDate           -95.428563price                 3.535346v_0                  -1.504738v_1                   1.582428v_2                   1.198679v_3                   1.352193v_4                   0.217941v_5                   2.052749v_6                   0.090718v_7                   0.823610v_8                  -1.532964v_9                   1.529931v_10                 -2.584452v_11                 -0.906428v_12                 -2.842834v_13                 -3.869655v_14                  0.491706v_15                  1.308716v_16                  1.662893v_17                  0.233318v_18                  0.814453v_19                  0.100073v_20                  2.001253v_21                  0.180020v_22                  0.819133v_23                  1.357847dtype: float64,SaleID                  -1.201476name                    -1.084474regDate                 11.041006model                    1.741896brand                    1.814245bodyType                -1.070358fuelType                -1.495782gearbox                 -0.157525power                 4473.885260kilometer                1.250933notRepairedDamage        3.347777regionCode              -0.352973creatDate            11376.694263price                   21.230678v_0                      2.901641v_1                      1.098703v_2                      3.749872v_3                      4.294578v_4                      6.953348v_5                      6.489791v_6                     -0.564878v_7                     -0.729838v_8                      0.370812v_9                      0.377943v_10                     4.796855v_11               
      1.547812v_12                     6.136342v_13                    13.199575v_14                    -1.597532v_15                    -0.029594v_16                     2.240928v_17                     2.569341v_18                     2.967738v_19                     6.923953v_20                     6.852809v_21                    -0.759948v_22                    -0.741708v_23                     0.143713dtype: float64)

# Distribution of the per-column skewness values.
sns.distplot(Train_data.skew(),color='blue',axlabel ='Skewness')

<AxesSubplot:xlabel='Skewness', ylabel='Density'>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-imnSDRwd-1618460688096)(output_37_1.png)]

# Distribution of the per-column kurtosis values.
# The axis label is corrected from the misspelled 'Kurtness'.
sns.distplot(Train_data.kurt(), color='orange', axlabel='Kurtosis')

<AxesSubplot:xlabel='Kurtness', ylabel='Density'>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EnGtRb0Y-1618460688099)(output_38_1.png)]

## 3) Frequency histogram of the target (price)
plt.hist(Train_data['price'], orientation = 'vertical',histtype = 'bar', color ='red')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rmh7p764-1618460688100)(output_39_0.png)]

plt.hist
x: 数据集，最终的直方图将对数据集进行统计
bins: 统计的区间分布
range: tuple, 显示的区间，range在没有给出bins时生效
density: bool，默认为false，显示的是频数统计结果，为True则显示频率统计结果，这里需要注意，频率统计结果=区间数目/(总数*区间宽度)，和normed效果一致，官方推荐使用density
histtype: 可选{‘bar’, ‘barstacked’, ‘step’, ‘stepfilled’}之一，默认为bar，推荐使用默认配置，step使用的是梯状，stepfilled则会对梯状内部进行填充，效果与bar类似
align: 可选{‘left’, ‘mid’, ‘right’}之一，默认为’mid’，控制柱状图的水平分布，left或者right，会有部分空白区域，推荐使用默认
log: bool，默认False，即y坐标轴是否使用对数刻度
stacked: bool，默认为False，是否为堆积状图

plt.hist 绘制直方图时，参数 density 为 True 或 False 分别表示是否对统计结果进行归一化；参数 orientation 决定直方图的方向，即由纵轴还是横轴来表示频数（频率）。

# Log-transform the target: the transformed price is spread far more evenly,
# which is why predicting in log space is a common trick for this kind of task.
# np.log1p (log(1 + x)) is used because the price column contains zeros (7312
# rows in the value counts): np.log(0) is -inf, and plt.hist then fails with
# "supplied range of [-inf, ...] is not finite".
plt.hist(np.log1p(Train_data['price']), orientation='vertical', histtype='bar', color='red')
plt.show()

---------------------------------------------------------------------------ValueError                                Traceback (most recent call last)<ipython-input-41-0f7fcb2a3190> in <module>1 # log变换 z之后的分布较均匀，可以进行log变换进行预测，这也是预测问题常用的trick
----> 2 plt.hist(np.log(Train_data['price']), orientation = 'vertical',histtype = 'bar', color ='red')3 plt.show()C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\pyplot.py in hist(x, bins, range, density, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, data, **kwargs)2683         orientation='vertical', rwidth=None, log=False, color=None,2684         label=None, stacked=False, *, data=None, **kwargs):
-> 2685     return gca().hist(2686         x, bins=bins, range=range, density=density, weights=weights,2687         cumulative=cumulative, bottom=bottom, histtype=histtype,C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\__init__.py in inner(ax, data, *args, **kwargs)1445     def inner(ax, *args, data=None, **kwargs):1446         if data is None:
-> 1447             return func(ax, *map(sanitize_sequence, args), **kwargs)1448 1449         bound = new_sig.bind(ax, *args, **kwargs)C:\ProgramData\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py in hist(self, x, bins, range, density, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)6649             # this will automatically overwrite bins,6650             # so that each histogram uses the same bins
-> 6651             m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)6652             tops.append(m)6653         tops = np.array(tops, float)  # causes problems later if it's an int<__array_function__ internals> in histogram(*args, **kwargs)C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in histogram(a, bins, range, normed, weights, density)790     a, weights = _ravel_and_check_weights(a, weights)791
--> 792     bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)793 794     # Histogram is an integer or a float array depending on the weights.C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_bin_edges(a, bins, range, weights)424             raise ValueError('`bins` must be positive, when an integer')425
--> 426         first_edge, last_edge = _get_outer_edges(a, range)427 428     elif np.ndim(bins) == 1:C:\ProgramData\Anaconda3\lib\site-packages\numpy\lib\histograms.py in _get_outer_edges(a, range)313                 'max must be larger than min in range parameter.')314         if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
--> 315             raise ValueError(316                 "supplied range of [{}, {}] is not finite".format(first_edge, last_edge))317     elif a.size == 0:ValueError: supplied range of [-inf, 11.512925464970229] is not finite

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DSC7R3b8-1618460688103)(output_41_1.png)]

# Separate the label, i.e. the value to be predicted.
Y_train = Train_data['price']

# Splitting the columns by dtype only works when the categorical features have
# not already been label-encoded. That is not the case here, so the two groups
# are listed by hand according to what each column means.
# Numeric features:
# numeric_features = Train_data.select_dtypes(include=[np.number])
# numeric_features.columns
# Categorical features (np.object was removed in NumPy 1.24; use object):
# categorical_features = Train_data.select_dtypes(include=[object])
# categorical_features.columns

# The two assignments were fused onto a single line, which is a syntax error.
numeric_features = [
    'power', 'kilometer',
    'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7',
    'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14',
]
categorical_features = [
    'name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox',
    'notRepairedDamage', 'regionCode',
]

unique()是以数组形式（numpy.ndarray）返回列的所有唯一值（特征的所有唯一值）

nunique() Return number of unique elements in the object.即返回的是唯一值的个数

# For every categorical feature of the training set, print the number of
# distinct values followed by the frequency of each value.
# The loop header and its three print calls were fused onto one line (a
# syntax error); they are restored as a properly indented loop body.
for cat_fea in categorical_features:
    print(cat_fea + "的特征分布如下：")
    print("{}特征有个{}不同的值".format(cat_fea, Train_data[cat_fea].nunique()))
    print(Train_data[cat_fea].value_counts())

name的特征分布如下：
name特征有个164312不同的值
451       452
73        429
1791      428
821       391
243       346...
92419       1
88325       1
82182       1
84231       1
157427      1
Name: name, Length: 164312, dtype: int64
model的特征分布如下：
model特征有个251不同的值
0.0      20344
6.0      17741
4.0      13837
1.0      13634
12.0      8841...
226.0        5
245.0        5
243.0        4
249.0        4
250.0        1
Name: model, Length: 251, dtype: int64
brand的特征分布如下：
brand特征有个40不同的值
0     53699
4     27109
11    26944
10    23762
1     22144
6     17202
9     12210
5      7343
15     6500
12     4704
7      3839
3      3831
17     3543
13     3502
8      3374
28     3161
19     2561
18     2451
16     2274
22     2264
23     2088
14     1892
24     1678
25     1611
20     1610
27     1392
29     1259
34      963
30      604
2       570
31      540
21      522
38      516
35      415
32      406
36      377
33      368
37      324
26      307
39      141
Name: brand, dtype: int64
bodyType的特征分布如下：
bodyType特征有个8不同的值
7.0    64571
3.0    53858
4.0    45646
5.0    20343
6.0    15290
2.0    12755
1.0     9882
0.0     2275
Name: bodyType, dtype: int64
fuelType的特征分布如下：
fuelType特征有个7不同的值
0.0    150664
5.0     72494
4.0      3577
3.0       385
2.0       183
1.0       147
6.0        60
Name: fuelType, dtype: int64
gearbox的特征分布如下：
gearbox特征有个2不同的值
1.0    184645
0.0     51842
Name: gearbox, dtype: int64
notRepairedDamage的特征分布如下：
notRepairedDamage特征有个2不同的值
1.0    176922
0.0     24542
Name: notRepairedDamage, dtype: int64
regionCode的特征分布如下：
regionCode特征有个8081不同的值
487     550
868     424
149     236
539     227
32      216...
7959      1
8002      1
6715      1
7117      1
4144      1
Name: regionCode, Length: 8081, dtype: int64

# Distinct-value count and value frequencies of every categorical feature
# in the test set
for cat_fea in categorical_features:
    print(cat_fea + "的特征分布如下：")
    print("{}特征有个{}不同的值".format(cat_fea, Test_data[cat_fea].nunique()))
    print(Test_data[cat_fea].value_counts())

name的特征分布如下：
name特征有个38668不同的值
73        98
821       89
243       77
451       74
826       73..
106879     1
108926     1
176509     1
178556     1
67583      1
Name: name, Length: 38668, dtype: int64
model的特征分布如下：
model特征有个249不同的值
0.0      3916
6.0      3496
1.0      2806
4.0      2802
12.0     1745...
247.0       2
246.0       2
214.0       1
243.0       1
232.0       1
Name: model, Length: 249, dtype: int64
brand的特征分布如下：
brand特征有个40不同的值
0     10697
4      5464
11     5374
10     4747
1      4390
6      3496
9      2408
5      1534
15     1325
12      929
7       782
3       736
17      732
13      679
8       666
28      645
19      534
18      487
16      458
22      430
14      416
23      397
24      390
25      297
20      293
27      265
29      236
34      206
30      133
21      121
2       101
38       92
31       87
35       76
36       73
26       72
32       70
37       61
33       61
39       40
Name: brand, dtype: int64
bodyType的特征分布如下：
bodyType特征有个8不同的值
7.0    12748
3.0    10808
4.0     9143
5.0     4175
6.0     3079
2.0     2484
1.0     1980
0.0      473
Name: bodyType, dtype: int64
fuelType的特征分布如下：
fuelType特征有个7不同的值
0.0    30045
5.0    14645
4.0      754
3.0       73
2.0       43
1.0       23
6.0       15
Name: fuelType, dtype: int64
gearbox的特征分布如下：
gearbox特征有个2不同的值
1.0    36935
0.0    10352
Name: gearbox, dtype: int64
notRepairedDamage的特征分布如下：
notRepairedDamage特征有个2不同的值
1.0    35555
0.0     4817
Name: notRepairedDamage, dtype: int64
regionCode的特征分布如下：
regionCode特征有个7078不同的值
487     122
868      93
539      46
32       46
222      46...
3761      1
6232      1
7891      1
2106      1
2246      1
Name: regionCode, Length: 7078, dtype: int64

# Include the target column in the numeric list for the correlation analysis.
# The membership check keeps the cell idempotent: re-running it must not add
# 'price' a second time.
if 'price' not in numeric_features:
    numeric_features.append('price')
numeric_features

['power','kilometer','v_0','v_1','v_2','v_3','v_4','v_5','v_6','v_7','v_8','v_9','v_10','v_11','v_12','v_13','v_14','price']

## 1) Correlation analysis
# Pairwise correlations between the columns named in numeric_features
price_numeric = Train_data[numeric_features]
correlation = price_numeric.corr()
# Correlation of each column with price, largest value first
price_correlation = correlation['price'].sort_values(ascending=False)
print(price_correlation, '\n')

price        1.000000
v_0          0.514477
v_11         0.481618
power        0.189456
v_8          0.183505
v_10         0.163891
v_12         0.129570
v_13         0.114883
v_7          0.090440
v_14         0.075673
v_4          0.004413
v_2         -0.018823
v_6         -0.036826
v_5         -0.039637
v_9         -0.165831
v_1         -0.207255
kilometer   -0.404961
v_3         -0.595468
Name: price, dtype: float64

# Heatmap of the correlation matrix computed above
f, ax = plt.subplots(figsize=(7, 7))
plt.title('Correlation of Numeric Features with Price', y=1, size=16)
sns.heatmap(correlation, square=True, vmax=0.8)

<AxesSubplot:title={'center':'Correlation of Numeric Features with Price'}>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-JwxFTkaQ-1618460688104)(output_50_1.png)]

data：矩阵数据集，可以是numpy的数组（array），如果是pandas的dataframe，则df的index/column信息会分别对应到heatmap的columns和rows
linewidths,热力图矩阵之间的间隔大小
vmax,vmin：颜色映射（colormap）所对应的数值上限和下限，同时决定图例（colorbar）显示的最大值和最小值；没有指定时会根据数据和其他参数自动推断
cmap：matplotlib的colormap名称或颜色对象；如果没有提供，默认为cubehelix map (数据集为连续数据集时) 或 RdBu_r (数据集为离散数据集时)
center:将数据设置为图例中的均值数据，即图例中心的数据值；通过设置center值，可以调整生成的图像颜色的整体深浅；设置center数据时，如果有数据溢出，则手动设置的vmax、vmin会自动改变
annot：annotate的缩写，默认为False，当annot为True时，在heatmap中每个方格写入数据
annot_kws，当annot为True时，可设置各个参数，包括大小，颜色，加粗，斜体字等
square：布尔值，可选参数。

如果为True，则将坐标轴方向设置为“equal”，以使每个单元格为方形。

# Remove the target column 'price' from the frame that was selected for the correlation analysis
del price_numeric['price']

## 2) Skewness and kurtosis of every column in numeric_features
for col in numeric_features:
    print(
        '{:15}'.format(col),
        'Skewness: {:05.2f}'.format(Train_data[col].skew()),
        '   ',
        'Kurtosis: {:06.2f}'.format(Train_data[col].kurt()),
    )

power           Skewness: 58.59     Kurtosis: 4473.89
kilometer       Skewness: -1.56     Kurtosis: 001.25
v_0             Skewness: -1.50     Kurtosis: 002.90
v_1             Skewness: 01.58     Kurtosis: 001.10
v_2             Skewness: 01.20     Kurtosis: 003.75
v_3             Skewness: 01.35     Kurtosis: 004.29
v_4             Skewness: 00.22     Kurtosis: 006.95
v_5             Skewness: 02.05     Kurtosis: 006.49
v_6             Skewness: 00.09     Kurtosis: -00.56
v_7             Skewness: 00.82     Kurtosis: -00.73
v_8             Skewness: -1.53     Kurtosis: 000.37
v_9             Skewness: 01.53     Kurtosis: 000.38
v_10            Skewness: -2.58     Kurtosis: 004.80
v_11            Skewness: -0.91     Kurtosis: 001.55
v_12            Skewness: -2.84     Kurtosis: 006.14
v_13            Skewness: -3.87     Kurtosis: 013.20
v_14            Skewness: 00.49     Kurtosis: -01.60
price           Skewness: 03.54     Kurtosis: 021.23

转换数据

df.melt() 是 df.pivot() 逆转操作函数

将列名转换为列数据(columns name → column values)，重构DataFrame

如果说 df.pivot() 将长数据集转换成宽数据集，df.melt() 则是将宽数据集变成长数据集

melt() 既是顶级类函数也是实例对象函数，作为类函数出现时，需要指明 DataFrame 的名称
参数说明（参数 / 类型 / 说明）：

frame：DataFrame。被 melt 的数据集名称，在 pd.melt() 中使用。

id_vars：tuple / list / ndarray，可选项。不需要被转换的列名，在转换后作为标识符列（不是索引列）。

value_vars：tuple / list / ndarray，可选项。需要被转换的现有列；如果未指明，除 id_vars 之外的其他列都被转换。

var_name：string，默认值为 variable。自定义列名名称，设置由 'value_vars' 组成的新的 column name。

value_name：string，默认值为 value。自定义列名名称，设置由 'value_vars' 的数据组成的新的 column name。

col_level：int / string，可选项。如果列是MultiIndex，则使用此级别。

seaborn.FacetGrid

data : DataFrame

处理后的（“长格式”）dataframe数据，其中每一列都是一个变量（特征），每一行都是一个样本

row, col, hue : strings

定义数据子集的变量，这些变量将在网格的不同方面绘制。请参阅下面*_order参数以控制该变量的级别顺序

例如：col=“sex”, hue=“smoker”,即列表示性别，颜色语意表示是否吸烟，下面示例会给出详细说明

col_wrap : int, optional

这个意思是图网格列维度限制，比如col_wrap =3，那么在这个画布里最多只能画3列。行不限制，这样就限制了列的个数。

share{x,y} : bool, ‘col’, or ‘row’ optional

是否共享x轴或者y轴，就是说如果为真，就共享同一个轴，否则就不共享，默认是都共享，即都为True

g = sns.FacetGrid(tips, col="sex", hue="smoker", sharex=True, sharey=True)  # 都共享
g.map(plt.scatter, "total_bill", "tip", alpha=0.8)
g.add_legend();

map是python内置函数，会根据提供的函数对指定的序列做映射。

map()函数的格式是：

map(function,iterable,…)
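第一个参数接受一个函数名，后面的参数接受一个或多个可迭代的序列，返回的是一个迭代器（Python 3 中为 map 对象，Python 2 中为 list）。

把函数依次作用在序列的每一个元素上；在 Python 3 中需要用 list(map(...)) 才能得到一个新的list。注意，map不改变原list。另外，上面代码中的 g.map 是 seaborn FacetGrid 的方法（对每个子图对应的数据子集调用绘图函数），并不是这个内置函数。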

## 3) Distribution of every numeric feature
# Long format: a "variable" column with the feature name and a "value" column
f = Train_data.melt(value_vars=numeric_features)
# One panel per feature, two panels per row, axes not shared between panels
# NOTE(review): sns.distplot is deprecated since seaborn 0.11; kept unchanged here
g = sns.FacetGrid(
    f,
    col="variable",
    col_wrap=2,
    sharex=False,
    sharey=False,
)
g = g.map(sns.distplot, "value")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-y5qOGg1w-1618460688110)(output_57_0.png)]

## 4) Pairwise relationships between numeric features
sns.set()
columns = ['price', 'v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14']
# `height` is the current name of the former `size` argument, which seaborn
# deprecated (0.9) and later removed.
sns.pairplot(Train_data[columns], height=2, kind='scatter', diag_kind='kde')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Oa5cvdtd-1618460688111)(output_58_0.png)]

# Show the column labels of the training set
Train_data.columns

Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType','gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode','creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6','v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15','v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21', 'v_22', 'v_23'],dtype='object')

# Show the price labels held in Y_train
Y_train

0           520
1          5500
2          1100
3          1200
4          3300...
249995     1200
249996     1200
249997    16500
249998    31950
249999     1990
Name: price, Length: 250000, dtype: int64

fig, ax = plt.subplots(1,3)，其中参数1和3分别代表子图的行数和列数，一共有 1x3 个子图。函数返回一个figure对象和由子图ax组成的数组。
plt.subplot(1,3,1)（注意是 subplot 而不是 subplots），最后一个参数1代表第一个子图。
如果想要设置整张图的宽度和高度可以在函数内加入figsize值
fig, ax = plt.subplots(1,3,figsize=(15,7))，这样就会得到一张 15x7 英寸、包含1行3列子图的图像（figsize 是整张图的大小，而不是每个子图的大小）。

## 5) Regression relationship between price and selected features
# 5 x 2 grid of axes, one panel per feature
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(24, 20))
# Features plotted against price, in panel order (row by row, left to right)
regplot_features = ['v_12', 'v_8', 'v_0', 'power', 'v_5', 'v_2', 'v_6', 'v_1', 'v_14', 'v_13']
for feature, ax in zip(regplot_features, axes.flat):
    # pd.concat arguments:
    #   objs: sequence of Series / DataFrame objects to combine
    #   axis: axis to concatenate along, 0 for rows and 1 for columns
    #   join: how to join, 'inner' or 'outer'
    scatter_data = pd.concat([Y_train, Train_data[feature]], axis=1)
    # sns.regplot plots the data together with a fitted linear regression model.
    #   x, y: the values for the x and y axes
    #   data: the DataFrame that x and y belong to
    #   x_estimator: function applied to each unique value of x, the resulting
    #       estimate is plotted; useful when x is discrete. If x_ci is given,
    #       the estimate is bootstrapped and a confidence interval is drawn
    #   x_bins: how many segments x is split into
    # Other parameters: https://www.cntofu.com/book/172/docs/28.md
    sns.regplot(x=feature, y='price', data=scatter_data, scatter=True, fit_reg=True, ax=ax)

<AxesSubplot:xlabel='v_13', ylabel='price'>

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-gY8sdLxP-1618460688115)(output_62_1.png)]

2、pd.qcut函数，按照数据出现频率百分比划分，比如要把数据分为四份，则四段分别是数据的0-25%，25%-50%，50%-75%，75%-100%
pd.qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise')

1、pd.cut函数有7个参数，主要用于对数据从最大值到最小值进行等距划分

pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False)

参数：

x : 输入待cut的一维数组

bins : cut的段数，一般为整型，但也可以为序列向量。

right : 布尔值，确定右区间是否开闭，取True时右区间闭合

labels : 数组或布尔值，默认为None，用来标识分后的bins，长度必须与结果bins相等，返回值为整数或者对bins的标识

retbins : 布尔值，可选。是否同时返回分箱的边界（bins），True则返回

precision : 整型，bins小数精度，也就是数据以几位小数显示

include_lowest : 布尔类型，是否包含左区间

## 1) Number of distinct values of every categorical feature
for fea in categorical_features:
    print(Train_data[fea].nunique())

# Show the current list of categorical feature names
categorical_features

['name','model','brand','bodyType','fuelType','gearbox','notRepairedDamage','regionCode']

## 2) Box plots of the categorical features
# name and regionCode have categories that are too sparse, so only the
# denser categorical features are plotted here.
categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']
for c in categorical_features:
    Train_data[c] = Train_data[c].astype('category')
    # .any() is a logical OR over the elements (True if any element is True);
    # .all() is the logical AND (True only if every element is True).
    if Train_data[c].isnull().any():
        # Give missing values their own explicit 'MISSING' category
        Train_data[c] = Train_data[c].cat.add_categories(['MISSING'])
        Train_data[c] = Train_data[c].fillna('MISSING')


def boxplot(x, y, **kwargs):
    """Draw a box plot of y grouped by x, with the x tick labels rotated 90 degrees.

    Extra keyword arguments are accepted and ignored.
    """
    sns.boxplot(x=x, y=y)
    plt.xticks(rotation=90)


# Long format: price plus a (variable, value) pair per categorical feature
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
# `height` is the current name of the former `size` argument of FacetGrid
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(boxplot, "value", "price")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jnQPwPid-1618460688125)(output_66_0.png)]

# Show the column labels of the training set
Train_data.columns

Index(['SaleID', 'name', 'regDate', 'model', 'brand', 'bodyType', 'fuelType','gearbox', 'power', 'kilometer', 'notRepairedDamage', 'regionCode','creatDate', 'price', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6','v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14', 'v_15','v_16', 'v_17', 'v_18', 'v_19', 'v_20', 'v_21', 'v_22', 'v_23'],dtype='object')

## 3) Violin plots of the categorical features
catg_list = categorical_features
target = 'price'
for catg in catg_list:
    sns.violinplot(x=catg, y=target, data=Train_data)
    # Show inside the loop so that every feature gets its own figure
    plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-voCMBxYk-1618460688127)(output_68_0.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Uo7yvKkg-1618460688129)(output_68_1.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-kuVLfUzR-1618460688133)(output_68_2.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-kcUEuAkJ-1618460688134)(output_68_3.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-QCkMQ3bM-1618460688136)(output_68_4.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Jh7J6swE-1618460688140)(output_68_5.png)]

categorical_features = ['model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'notRepairedDamage']

## 4) Bar charts of the categorical features
def bar_plot(x, y, **kwargs):
    """Draw a bar plot of y grouped by x, with the x tick labels rotated 90 degrees.

    Extra keyword arguments are accepted and ignored.
    """
    sns.barplot(x=x, y=y)
    plt.xticks(rotation=90)


# Long format: price plus a (variable, value) pair per categorical feature
f = pd.melt(Train_data, id_vars=['price'], value_vars=categorical_features)
# `height` is the current name of the former `size` argument of FacetGrid
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(bar_plot, "value", "price")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-o3RO6hfI-1618460688142)(output_70_0.png)]

##  5) Frequency of every category of the categorical features (count plot)
def count_plot(x, **kwargs):
    """Draw a count plot of x, with the x tick labels rotated 90 degrees.

    Extra keyword arguments are accepted and ignored.
    """
    sns.countplot(x=x)
    plt.xticks(rotation=90)


# Long format: one (variable, value) pair per categorical feature
f = pd.melt(Train_data, value_vars=categorical_features)
# `height` is the current name of the former `size` argument of FacetGrid
g = sns.FacetGrid(f, col="variable", col_wrap=2, sharex=False, sharey=False, height=5)
g = g.map(count_plot, "value")

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1ugoBEPk-1618460688143)(output_71_0.png)]

import pandas_profiling

# Build a profiling report for the training set and write it to an HTML file
report_path = "./example.html"
pfr = pandas_profiling.ProfileReport(Train_data)
pfr.to_file(report_path)

Summarize dataset:   0%|          | 0/51 [00:00<?, ?it/s]

二手车Task2 数据分析相关推荐

【算法竞赛学习】二手车交易价格预测-Task2数据分析
二手车交易价格预测-Task2 数据分析二. EDA-数据探索性分析 Tip:此部分为零基础入门数据挖掘的 Task2 EDA-数据探索性分析部分,带你来了解数据,熟悉数据,和数据做朋友,欢迎大家 ...
二手车数据分析-task2数据分析
零基础入门数据挖掘-Task2 数据分析 Datawhale 二手车数据挖掘-Task2数据分析一. EDA-Exploratory Data Analysis-数据探索性分析二. 相关任务三. ...
【算法竞赛学习】气象海洋预测-Task2 数据分析
气象海洋预测-Task2 数据分析数据分析是我们解决一个数据挖掘任务的重要一环,通过数据分析,我们可以了解标签的分布.数据中存在的缺失值和异常值.特征与标签之间的相关性.特征之间的相关性等,并根据数 ...
【算法竞赛学习】数字中国创新大赛智慧海洋建设-Task2数据分析
智慧海洋建设-Task2 数据分析此部分为智慧海洋建设竞赛的数据分析模块,通过数据分析,可以熟悉数据,为后面的特征工程做准备,欢迎大家后续多多交流. 赛题:智慧海洋建设数据分析的目的: EDA的主 ...
贷款违约预测-Task2 数据分析
Task2 数据分析此部分为零基础入门金融风控的 Task2 数据分析部分,带你来了解数据,熟悉数据,为后续的特征工程做准备,欢迎大家后续多多交流. 赛题:零基础入门数据挖掘 - 零基础入门金融风控 ...
Datawhale 零基础入门数据挖掘-Task2 数据分析
数据探索在机器学习中我们一般称为EDA(Exploratory Data Analysis):是指对已有的数据(特别是调查或观察得来的原始数据)在尽量少的先验假定下进行探索,通过作图.制表.方程拟合. ...
基于python的二手车网站数据分析与可视化
开发工具(eclipse/idea/vscode等):pychram 数据库(sqlite/mysql/sqlserver等):sqlite或者mysql 功能模块(请用文字描述,至少200字): 1 ...
Datawhale学习笔记【阿里云天池金融风控-贷款违约预测】Task2 数据分析
阿里云天池学习赛[金融风控-贷款违约预测] 赛题数据及背景 python库的导入国内镜像源网址及使用方法镜像使用方法文件读取数据的总体了解查看数据集中特征缺失值,唯一值等检查缺失值缺失值 ...
Task2 数据分析
数据探索性分析 #导入warnings包,利用过滤器来实现忽略警告语句. import warnings warnings.filterwarnings('ignore')import pandas ...

二手车Task2 数据分析

2) 查看skewness偏度和 kurtosis峰度

偏度: 是描述数据分布形态的统计量，其描述的是某总体取值分布的对称性，简单来说就是数据的不对称程度,绝对值越大表明数据分布越不对称，偏斜程度大

峰度: 描述某变量所有取值分布形态陡缓程度的统计量，简单来说就是数据分布顶的尖锐程度(>0尖顶峰, <0平顶峰, =0与正态分布陡峭程度一致)

plt.hist绘制直方图参数density 为True和False分别代表是否归一化参数orientation决定了是采用纵轴代表频率还是横轴代表频率的展现形式

seaborn.FacetGrid

二手车Task2 数据分析相关推荐

最新文章

热门文章

二手车Task2 数据分析

2) 查看skewness偏度 和 kurtosis峰度

偏度: 是描述数据分布形态的统计量，其描述的是某总体取值分布的对称性，简单来说就是数据的不对称程度,绝对值越大表明数据分布越不对称，偏斜程度大

峰度: 描述某变量所有取值分布形态陡缓程度的统计量，简单来说就是数据分布顶的尖锐程度(>0尖顶峰, <0平顶峰, =0与正态分布陡峭程度一致)

plt.hist绘制直方图参数density 为True和False分别代表是否归一化 参数orientation决定了是采用纵轴代表频率还是横轴代表频率的展现形式

seaborn.FacetGrid

二手车Task2 数据分析相关推荐

最新文章

热门文章

2) 查看skewness偏度和 kurtosis峰度

plt.hist绘制直方图参数density 为True和False分别代表是否归一化参数orientation决定了是采用纵轴代表频率还是横轴代表频率的展现形式