
1. 数据说明

  • User_ID:用户ID
  • Gender:性别,M为男性,F为女性
  • Age:年龄段,划分为0-17、18-25、26-35、36-45、46-50、51-55、55+共七个年龄段
  • Occupation:职业,已转换为数字标签,共有21类职业
  • Stay_In_Current_City_Years:所在城市居住年份,分为0、1、2、3、4+五个类别
  • Marital_Status:婚姻状况,0为未婚,1为已婚
  • 件数:本次消费所购买的商品数目
  • 消费总额:该用户本次消费所支出的总金额,单位为美元


Starting Rserve…
[1] “F:\新建文件夹 (6)\黑色星期五\book233用户信息.csv”
> user=read.csv(“F:\新建文件夹 (6)\黑色星期五\book233用户信息.csv”)
> str(user)
‘data.frame’: 1047 obs. of 8 variables:
$ User_ID : int 1000001 1000003 1000005 1000006 1000015 1000019 1000020 1000022 1000024 1000033 …
$ Gender : Factor w/ 3 levels “”,“F”,“M”: 2 3 3 2 3 3 3 3 2 3 …
$ Age : Factor w/ 8 levels “”,“0-17”,“18-25”,…: 2 4 4 7 4 2 4 3 4 6 …
$ Occupation : int 10 15 20 9 7 10 14 15 7 3 …
$ Stay_In_Current_City_Years: Factor w/ 6 levels “”,“0”,“1”,“2”,…: 4 5 3 3 3 5 2 6 5 3 …
$ Marital_Status : int 0 0 1 0 0 0 0 0 1 1 …
$ 件数 : int 34 29 106 46 116 144 12 155 76 215 …
$ 消费总额 : int 333481 341635 821001 379450 1047124 1457938 185747 1279678 720850 1940043 …

2. 数据预处理

(1) 删除第一列的用户ID,在利用用户个人信息对用户消费总额进行拟合的过程中,用户ID显然是不能作为自变量的。(唯一属性并不能描述事件本身的分布规律)此外,下方代码中还一并删除了第7列“件数”(users[,-7]),因此最终只保留5个用户属性和因变量“消费总额”。

> #install.packages("mice")
> library("mice")
> md.pattern(user)  #缺失值检测
     Gender Age Stay_In_Current_City_Years 件数 消费总额 User_ID Occupation Marital_Status
1045      1   1                          1    1        1       1          1              1 0
1         1   1                          1    1        1       0          0              0 3
1         1   1                          1    0        0       0          0              0 5
          0   0                          0    1        1       2          2              2 8
> users=na.omit(user)#缺失值删除
> md.pattern(users)
     User_ID Gender Age Occupation Stay_In_Current_City_Years Marital_Status 件数 消费总额
[1,]       1      1   1          1                          1              1    1        1 0
[2,]       0      0   0          0                          0              0    0        0 0
> #可以看到已经没有缺失值啦
> users_1=users[,-7]
> users_12=users_1[,-1]
> #删除1、7 列
> str(users_12)
'data.frame': 1045 obs. of  6 variables:
 $ Gender                    : Factor w/ 3 levels "","F","M": 2 3 3 2 3 3 3 3 2 3 ...
 $ Age                       : Factor w/ 8 levels "","0-17","18-25",..: 2 4 4 7 4 2 4 3 4 6 ...
 $ Occupation                : int  10 15 20 9 7 10 14 15 7 3 ...
 $ Stay_In_Current_City_Years: Factor w/ 6 levels "","0","1","2",..: 4 5 3 3 3 5 2 6 5 3 ...
 $ Marital_Status            : int  0 0 1 0 0 0 0 0 1 1 ...
 $ 消费总额                  : int  333481 341635 821001 379450 1047124 1457938 185747 1279678 720850 1940043 ...


> users_12$Occupation= as.factor(users_12$Occupation)
> users_12$Marital_Status= as.factor(users_12$Marital_Status)
> str(users_12)
'data.frame': 1045 obs. of  6 variables:
 $ Gender                    : Factor w/ 3 levels "","F","M": 2 3 3 2 3 3 3 3 2 3 ...
 $ Age                       : Factor w/ 8 levels "","0-17","18-25",..: 2 4 4 7 4 2 4 3 4 6 ...
 $ Occupation                : Factor w/ 21 levels "0","1","2","3",..: 11 16 21 10 8 11 15 16 8 4 ...
 $ Stay_In_Current_City_Years: Factor w/ 6 levels "","0","1","2",..: 4 5 3 3 3 5 2 6 5 3 ...
 $ Marital_Status            : Factor w/ 2 levels "0","1": 1 1 2 1 1 1 1 1 2 2 ...
 $ 消费总额                  : int  333481 341635 821001 379450 1047124 1457938 185747 1279678 720850 1940043 ...
> boxplot(users_12$ 消费总额 ,col="yellow")  #箱型图查看异常值
> boxplot.stats(users_12$ 消费总额)
$stats
[1]   45551  281780  730131 1672669 3737504

$n
[1] 1045

$conf
[1] 662149.4 798112.6

$out
 [1]  4355777  6573609  5212846  6310604  4997527  4647555  3917492  4681205  4255176  4054112
[11]  5499812  3770941  6511302  3786677  4003012  5628295  4728932  6387899  4178546  3888766
[21]  5805353  4503530  5136424  5103795  3977702  4055317  8699232  4358776  3797112  6817493
[31]  5549841  5166938  4433272  4135916  4032859  7577505  4303859  6126540  4453785  5673106
[41]  3955182  6476786  4028509  4528519  6186498  5961987  4384924  4664260  5153189  4622308
[51]  6044178  4152683  4094730  3847749  4836540 10536783  4256751  5733683  6565878  4006176
[61]  5129726  5150348  4642305  4689382  4174884  4458155  3824963  4098692  4246978  5075337
[71]  5985405  4354802
> #建立训练集与测试集
> ind  = sample(2,nrow(users_12),replace = TRUE,prob=c(0.7,0.3))
> train=users_12[ind==1,]
> test=users_12[ind==2,]

3. 通过多元线性回归预测用户消费总额

> set.seed((12))
> users_lm=lm(消费总额~Gender+Age+Occupation+Stay_In_Current_City_Years,data=train)
> users_lm

Call:
lm(formula = 消费总额 ~ Gender + Age + Occupation + Stay_In_Current_City_Years, data = train)

Coefficients:
                 (Intercept)                       GenderM                      Age18-25
                     1112974                        330122                       -204918
                    Age26-35                      Age36-45                      Age46-50
                        3245                       -195851                       -274017
                    Age51-55                        Age55+                   Occupation1
                     -573862                       -857404                        152753
                 Occupation2                   Occupation3                   Occupation4
                      133586                        277339                        190566
                 Occupation5                   Occupation6                   Occupation7
                       69889                        792490                        145483
                 Occupation8                   Occupation9                  Occupation10
                     -604240                       -469454                       -203505
                Occupation11                  Occupation12                  Occupation13
                      207628                       -313140                       -330398
                Occupation14                  Occupation15                  Occupation16
                      341738                         36168                        922765
                Occupation17                  Occupation18                  Occupation19
                      -83717                        280807                         78211
                Occupation20   Stay_In_Current_City_Years1   Stay_In_Current_City_Years2
                      652190                       -208613                        -33459
 Stay_In_Current_City_Years3  Stay_In_Current_City_Years4+
                     -185424                       -208820

> summary(users_lm)

Call:
lm(formula = 消费总额 ~ Gender + Age + Occupation + Stay_In_Current_City_Years, data = train)

Residuals:
     Min       1Q   Median       3Q      Max
-1933294  -833549  -358080   413600  8366773

Coefficients:
                             Estimate Std. Error t value Pr(>|t|)
(Intercept)                   1112974     534216   2.083  0.03758 *
GenderM                        330122     112565   2.933  0.00347 **
Age18-25                      -204918     499728  -0.410  0.68189
Age26-35                         3245     501646   0.006  0.99484
Age36-45                      -195851     510165  -0.384  0.70117
Age46-50                      -274017     540614  -0.507  0.61241
Age51-55                      -573863     526379  -1.090  0.27600
Age55+                        -857404     557787  -1.537  0.12471
Occupation1                    152753     210666   0.725  0.46864
Occupation2                    133586     247114   0.541  0.58897
Occupation3                    277339     298621   0.929  0.35335
Occupation4                    190566     196994   0.967  0.33370
Occupation5                     69890     457117   0.153  0.87853
Occupation6                    792490     352716   2.247  0.02496 *
Occupation7                    145483     203931   0.713  0.47584
Occupation8                   -604240     662511  -0.912  0.36206
Occupation9                   -469454     664999  -0.706  0.48046
Occupation10                  -203505     513061  -0.397  0.69175
Occupation11                   207628     364321   0.570  0.56893
Occupation12                  -313140     230735  -1.357  0.17517
Occupation13                  -330398     541278  -0.610  0.54179
Occupation14                   341738     262857   1.300  0.19400
Occupation15                    36168     316049   0.114  0.90892
Occupation16                   922765     297426   3.103  0.00200 **
Occupation17                   -83717     248834  -0.336  0.73664
Occupation18                   280807     594842   0.472  0.63703
Occupation19                    78211     436344   0.179  0.85780
Occupation20                   652190     235661   2.767  0.00580 **
Stay_In_Current_City_Years1   -208613     149594  -1.395  0.16360
Stay_In_Current_City_Years2    -33459     171450  -0.195  0.84533
Stay_In_Current_City_Years3   -185424     174960  -1.060  0.28960
Stay_In_Current_City_Years4+  -208820     178018  -1.173  0.24119
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1288000 on 697 degrees of freedom
Multiple R-squared:  0.08683,   Adjusted R-squared:  0.04622
F-statistic: 2.138 on 31 and 697 DF,  p-value: 0.0003726
> lm_predictions= predict(users_lm, test)
> #install.packages("gmodels")
> library("gmodels")
> #计算相对误差
> w_lm=mean((lm_predictions-test$消费总额)^2)/ mean((mean(test$消费总额)- test$消费总额)^2)
> w_lm
[1] 1.007764
> w__lm=mean((lm_predictions-test$消费总额)^2)
> w__lm
[1] 2.127004e+12
> plot(lm_predictions,test$消费总额)

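其中t检验的结果并不理想:在0.05的显著性水平下,只有截距项、GenderM、Occupation6、Occupation16和Occupation20的系数显著,其余绝大多数自变量(包括全部年龄段和居住年限的哑变量)均不显著。F检验的p-value为0.0003726,说明模型整体是显著的,但是调整后的R-squared仅为0.04622,非常不佳,说明自变量对于因变量的解释率仅约为4.6%。测试集上的相对误差为1.007764,略大于1,即模型的预测效果并不优于直接用测试集均值进行预测。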

4. 通过随机森林预测用户消费总额

randomForest(x, y=NULL, ntree=500,importance=FALSE,localImp=FALSE, nPerm=1,mtry=3, proximity)


> #随机森林
> set.seed(123)
> library("randomForest")
> users_tree=randomForest(消费总额~.,data=train,importance=TRUE,ntree=100)
> print(users_tree)

Call:
 randomForest(formula = 消费总额 ~ ., data = train, importance = TRUE,      ntree = 100)
               Type of random forest: regression
                     Number of trees: 100
No. of variables tried at each split: 1

          Mean of squared residuals: 1.720726e+12
                    % Var explained: 0.96
> importance((users_tree))
                             %IncMSE IncNodePurity
Gender                     4.6765780  1.445564e+13
Age                        4.0652253  4.626723e+13
Occupation                 2.7830964  9.723711e+13
Stay_In_Current_City_Years 2.3300941  3.103846e+13
Marital_Status             0.7702007  1.082766e+13
> tree_predictions= predict(users_tree, test)
> t_lm=mean((tree_predictions-test$消费总额)^2)/ mean((mean(test$消费总额)- test$消费总额)^2)
> t_lm
[1] 1.001825
> t__lm=mean((tree_predictions-test$消费总额)^2)
> t__lm
[1] 2.114469e+12
> #改变决策森林中树的数目
> users_tree_2=randomForest(消费总额~.,data=train,importance=TRUE,ntree=1000)
> print(users_tree_2)

Call:
 randomForest(formula = 消费总额 ~ ., data = train, importance = TRUE,      ntree = 1000)
               Type of random forest: regression
                     Number of trees: 1000
No. of variables tried at each split: 1

          Mean of squared residuals: 1.720764e+12
                    % Var explained: 0.95
> importance((users_tree_2))
                            %IncMSE IncNodePurity
Gender                     9.670613  1.709002e+13
Age                        6.188277  4.585156e+13
Occupation                 7.528485  1.080333e+14
Stay_In_Current_City_Years 3.679895  3.365439e+13
Marital_Status             1.603983  9.651066e+12
> tree_2_predictions= predict(users_tree_2, test)
> t2_lm=mean((tree_2_predictions-test$消费总额)^2)/ mean((mean(test$消费总额)- test$消费总额)^2)
> t2_lm
[1] 1.001088
> t2__lm=mean((tree_2_predictions-test$消费总额)^2)
> t2__lm
[1] 2.112913e+12
> #改变决策森林中树的数目
> users_tree_3=randomForest(消费总额~.,data=train,importance=TRUE,ntree=5)
> print(users_tree_3)

Call:
 randomForest(formula = 消费总额 ~ ., data = train, importance = TRUE,      ntree = 5)
               Type of random forest: regression
                     Number of trees: 5
No. of variables tried at each split: 1

          Mean of squared residuals: 1.999001e+12
                    % Var explained: -15.06
> importance((users_tree_3))
                              %IncMSE IncNodePurity
Gender                      2.1048620  2.034920e+13
Age                         2.0914720  7.205510e+13
Occupation                 -0.1595652  1.474722e+14
Stay_In_Current_City_Years  0.3312179  4.161017e+13
Marital_Status              0.1408100  2.436103e+13
> tree_3_predictions= predict(users_tree_3, test)
> t3_lm=mean((tree_3_predictions-test$消费总额)^2)/ mean((mean(test$消费总额)- test$消费总额)^2)
> t3_lm
[1] 1.0148
> t3__lm=mean((tree_3_predictions-test$消费总额)^2)
> t3__lm
[1] 2.141854e+12


树的数目 % Var explained: 测试集相对误差
100 0.96 1.001825
1000 0.95 1.001088
5 -15.06 1.0148

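这里一个重要的指标是% Var explained,称为拟合优度,它的作用类似于之前回归分析中的R方。需要注意它的单位是百分比,并且是基于袋外(OOB)样本计算的:0.96表示模型仅解释了约0.96%的方差;取负值(如5棵树时的-15.06)则说明模型的袋外预测误差比直接用均值预测还要大。结合测试集相对误差均在1左右可以看出,随机森林同样几乎无法利用这些用户属性解释消费总额的变动。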






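> #加入平方项和交叉项(注意:在R的公式中,**与^等价,表示交互的阶数而非数值平方;对单个因子变量而言Gender**2等价于Gender,因此下式实际新增的只有Gender*Age、Occupation*Stay_In_Current_City_Years这些交叉项)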
> set.seed((12))
> users_jx_lm=lm(消费总额~Gender+Age+Occupation+Stay_In_Current_City_Years+Gender**2+Age**2+Occupation**2+Stay_In_Current_City_Years**2+Gender*Age+Occupation*Stay_In_Current_City_Years,data=train)
> summary(users_jx_lm)

Call:
lm(formula = 消费总额 ~ Gender + Age + Occupation + Stay_In_Current_City_Years + Gender^2 + Age^2 + Occupation^2 + Stay_In_Current_City_Years^2 + Gender * Age + Occupation * Stay_In_Current_City_Years, data = train)

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1259000 on 620 degrees of freedom
Multiple R-squared:  0.2238,    Adjusted R-squared:  0.08856
F-statistic: 1.655 on 108 and 620 DF,  p-value: 0.000131
> #加入交叉项和平方项
> w__jx_lm=mean((lm_jx_predictions-test$消费总额)^2)
> users_jx_lm=lm(消费总额~Gender+Age+Occupation+Stay_In_Current_City_Years+Gender**2+Age**2+Occupation**2+Stay_In_Current_City_Years**2+Gender*Age+Occupation*Stay_In_Current_City_Years*Gender+Age*Occupation+Stay_In_Current_City_Years*Age,data=train)
> summary(users_jx_lm)

Call:
lm(formula = 消费总额 ~ Gender + Age + Occupation + Stay_In_Current_City_Years + Gender^2 + Age^2 + Occupation^2 + Stay_In_Current_City_Years^2 + Gender * Age + Occupation * Stay_In_Current_City_Years * Gender + Age * Occupation + Stay_In_Current_City_Years * Age, data = train)

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1296000 on 472 degrees of freedom
Multiple R-squared:  0.3745,    Adjusted R-squared:  0.03519
F-statistic: 1.104 on 256 and 472 DF,  p-value: 0.1805
> lm_jx_predictions= predict(users_jx_lm, test)
Warning message:
In predict.lm(users_jx_lm, test) : 用秩缺乏拟合来进行预测的结果很可能不可靠
> #计算相对误差
> w_jx_lm=mean((lm_jx_predictions-test$消费总额)^2)/ mean((mean(test$消费总额)- test$消费总额)^2)
> w_jx_lm
[1] 1.491868
> w__jx_lm=mean((lm_jx_predictions-test$消费总额)^2)
> w__jx_lm
[1] 3.148763e+12


