chapter 9

餐饮企业综合分析

统计分析>>ARIMA预测销售额>>协同过滤算法对菜品进行智能推荐>>Apriori算法对菜品进行关联分析>>K-means算法进行客户价值分析>>决策树算法进行客户流失预测
这几种算法是不同从不同方面得到不同的结果的独立模块，根据算法的要求，对数据进行相关的数据预处理

9.2 任务实现

统计餐饮数据

# 【分组聚合：使用aggregate或者split-lapply-cbind模式】# 设置工作目录
setwd()
# [使用aggregate函数进行分组统计]
# 导入数据
score <- read.csv("./data/score.csv", stringsAsFactors = FALSE)
# 提取score中的gender字段
gender <- list(score$gender)
# 对score1和score2列进行分组统计
aggregate(score[, c(2,3)], gender, mean)
# [使用split-lapply-cbind模式进行分组统计]
# 分组
sp <- split(score, score$gender, drop = TRUE)
# 求score1和score2的均值
score1 <- lapply(sp, FUN = function(x) mean(x$score1))
score2 <- lapply(sp, FUN = function(x) mean(x$score2))
# 合并分组计算结果
result <- cbind(score1, score2 )
result# 【使用melt和dcast这两个函数实现透视表功能】
library(reshape2)
data(airquality)
airquality <- airquality[29:34, ]
names(airquality) <- tolower(names(airquality))
airquality# 【用melt转换数据格式】
md <- melt(airquality, id = c("month", "day"))
md
# 还原melt后的数据
cd <- dcast(md, month + day ~ variable)
cd# 设置工作目录
setwd()
# 【统计每日用餐人数与销售额】
info <- read.csv("./data/meal_order_info.csv")  # 导入数据info$use_start_time <- as.Date(info$use_start_time)  # 转换时间格式
table(info$order_status)  # 查看订单状态
info <- info[which(info$order_status == 1), ]  # 提取订单状态为1的数据# 【统计每日用餐人数与营业额】
sale <- aggregate(info[, c(3, 9)], list(info$use_start_time), sum)
colnames(sale) <- c("date", "number", "saleroom")  # 修改列名# 【导出每日的用餐人数和销售额】
write.csv(sale, "./tmp/sale_day.csv", row.names = FALSE)# 【每日用餐人数折线图】
sale$date <- as.POSIXct(sale$date)  # 将date字段转换时间格式
plot(sale$date, sale$number, col = "orange", type="b", xlab = "日期", ylab = "用餐人数")
# 画出每日营业额的折线图
plot(sale$date, sale$saleroom, col = "blue", type="o", xlab = "日期", ylab = "营业额")# 【计算菜品的热销度度】
# 导入数据
detail <- read.csv("./data/meal_order_detail.csv", stringsAsFactors = FALSE)# 菜品名称删除回车符和空格
head(detail$dishes_name)  # 查看前6个菜品名称
detail$dishes_name <- gsub("\\s|\\n+", "", detail$dishes_name)# 求出每个菜品的销售量
sales_volume <- aggregate(detail$counts, list(detail$dishes_name), sum)
colnames(sales_volume) <- c("dishes_name", "counts")# 求出每个菜品的热销度
sales_formula <- (sales_volume$count - min(sales_volume$count)) /(max(sales_volume$count) - min(sales_volume$count))
sales_volume$sales_hot <- round(sales_formula, 3)  # 保留3位小数# 查看热销度最高和最低的菜品
sales_volume[which(sales_volume$sales_hot == max(sales_volume$sales_hot)), ]
sales_volume[which(sales_volume$sales_hot == min(sales_volume$sales_hot)), ]# 画出热销度最高的前10个菜品的条形图
# 按热销度进行排序
sales_volume <- sales_volume[order(sales_volume$sales_hot, decreasing = TRUE), ]
barplot(sales_volume[1:10, 3], names.arg = sales_volume[1:10, 1], xlab = "菜品名称", ylab = "热销度", col = "blue")
write.csv(sales_volume, "./tmp/sales_volume.csv", row.names = FALSE)  # 导出数据# 【计算菜品毛利率】
# 导入数据
dish <- read.csv("./data/meal_dishes_detail.csv", stringsAsFactors = FALSE)
dish <- dish[, c(1, 3, 4, 8, 14)]  # 特征选取
# 菜品名称删除回车符
head(dish$dishes_name)
dish$dishes_name <- gsub("\\s|\\n+", "", dish$dishes_name)# 根据毛利率计算公式求出毛利率，并保留两位小数
dish$rate <- round((dish$price - dish$cost) / (dish$price), 2)# 找出毛利率最高和最低的菜品
dish[which(dish$rate == max(dish$rate)), ]
dish[which(dish$rate == min(dish$rate)), ]
write.csv(dish, "./tmp/profit.csv", row.names = FALSE)  # 导出数据

构建ARIMA模型

# 设置工作目录并读取数据
# 【检验平稳性和纯随机性】
setwd()
sale <- read.csv("./tmp/sale_day.csv")saleroom <- ts(sale[1:28, 3])
plot(saleroom, xlab = "时间", ylab = "销售额")  # 绘制时序图
acf(saleroom, lag.max = 30)  # 绘制ACF图# 差分
saleroom.diff <- diff(saleroom, differences = 2)  # 进行差分
acf(saleroom.diff, lag.max = 30)  # 绘制差分后序列的ACF图
# 单位根检验
library(tseries)
adf.test(saleroom)
Box.test(saleroom, type = "Ljung-Box")  # 纯随机性检验

# 【构建模型】
# BIC图
library(TSA)
# 原序列定阶
saleroom.BIC <- armasubsets(y = saleroom, nar = 5, nma = 5)
plot(saleroom.BIC)
# 差分后的序列定阶
saleroom.diff.BIC <- armasubsets(y = saleroom.diff, nar = 5, nma = 5)
plot(saleroom.diff.BIC)# 根据BIC图定阶
library(forecast)
# 初始化
checkout <- data.frame(p = 0, d = 0, q = 0, P = 0, D = 0, Q = 0, "残差P值" = 0, "平均误差" = 0)
test_checkout <- data.frame(p = 0, d = 0, q = 0, P = 0, D = 0, Q = 0, "残差P值" = 0, "平均误差" = 0)
j <- 1test_model <- function(p, q, P, Q){model <- Arima(saleroom, order = c(p, 0, q),seasonal = list(order = c(P, 2, Q), period = 7))result <- Box.test(model$residuals, type = "Ljung-Box")# 预测sale.forecast <- forecast(model, h = 3, level = c(99.5))# 计算平均绝对百分误差error <- abs(as.numeric(sale.forecast[[4]]) - sale[29:31,3]) / sale[29:31,3]p.value <- round(result$p.value, 4)print(paste('p=', p, ';q=', q, ';P=', P,',Q=', Q, ';残差P值:',p.value, ';平均误差:', mean(error), collapse = ""))test_checkout[1,1] <- ptest_checkout[1,2] <- 0test_checkout[1,3] <- qtest_checkout[1,4] <- Ptest_checkout[1,5] <- 2test_checkout[1,6] <- Qtest_checkout[1,7] <- round(result$p.value, 4)test_checkout[1,8] <- mean(error) return(test_checkout)
}for (p in c(0,3,4,5)) {if (p == 0 | p == 3) {for (q in 1:5) {for (P in c(0,1)) {for (Q in c(1,2,3,5)) {test_checkout <- test_model(p, q, P, Q)checkout[j, ] <- test_checkout[1, ]j <- j + 1}}}}if (p == 4) {for (q in 1:5) {if (q == 1) {for (Q in c(1,2,3,5)) {test_checkout <- test_model(p, q, 1, Q)checkout[j, ] <- test_checkout[1, ]j <- j + 1}}if (q != 1) {for (Q in c(1,2,3,5)) {test_checkout <- test_model(p, q, 0, Q)checkout[j, ] <- test_checkout[1, ]j <- j + 1}}}}if (p == 5) {for (q in 1:5) {for (Q in c(1,2,3,5)) {test_checkout <- test_model(p, q, 0, Q)checkout[j, ] <- test_checkout[1, ]j <- j + 1}}}
}
write.csv(checkout, "./tmp/checkout.csv", row.names = F)  # 导出每个模型的结果# 取最优模型预测
model <- Arima(saleroom, order = c(0,0,1), seasonal = list(order = c(0,2,2), period = 7))
summary(model)Box.test(model$residuals, type = "Ljung-Box")  # 纯随机性检验# 预测未来3天的销售额
sale.forecast <- forecast(model, h = 3, level = c(99.5))
plot(sale.forecast)# 计算平均误差
error <- abs(as.numeric(sale.forecast[[4]]) - sale[29:31,3]) / sale[29:31,3]
mean(error)

使用协同过滤算法实现菜品的智能推荐

# 设置工作目录并读取数据
setwd()
info <- read.csv("./data/meal_order_info.csv", stringsAsFactors = FALSE)
detail <- read.csv("./data/meal_order_detail.csv", stringsAsFactors = FALSE)# 数据预处理
# 菜品名称删除回车符和空格
head(detail$dishes_name)
detail$dishes_name <- gsub("\\s|\\n+", "", detail$dishes_name)
# 删除白饭的记录
detail <- detail[-which(detail$dishes_name == "白饭/小碗" | detail$dishes_name == "白饭/大碗"), ]# 查看订单状态
table(info$order_status)
# 统计订单状态为0或2的订单占比
info.id <- info[which(info$order_status == 0 | info$order_status == 2), "info_id"]
proportion <- length(info.id) / nrow(info)
proportion
# 删除detail数据中无意义的订单
detail <- detail[-which(detail$order_id %in% info.id), ]
# 提取订单状态为1的数据
info <- info[which(info$order_status == 1), ]# 特征选取
info <- info[, c(1:3, 8:10, 12, 19:21)]
detail <- detail[, c(1:3, 6, 8, 9, 11, 19)]# 写出数据归约后的订单表和订单详情表
write.csv(info, "./tmp/info_clear.csv", row.names = FALSE)
write.csv(detail, "./tmp/detail_clear.csv", row.names = FALSE)# 【构建两个推荐模型】
# 基于物品的协同过滤
require(recommenderlab)
# 将用户ID和菜品名称转换为0-1二元型数据，即模型的输入数据集
dishes <- as(detail[, c(8, 4)], "binaryRatingMatrix")
write.csv(as(dishes, "matrix"), "./tmp/dishes_matrix.csv")  # 导出二元型矩阵model.IBCF <- Recommender(dishes, method = "IBCF")  # 建模# 导出相似度矩阵
dishes.model.sim <- as(model.IBCF@model$sim, "matrix")
write.csv(dishes.model.sim, "./tmp/dishes_model_sim.csv")# 利用模型对原始数据集进行预测并获得推荐长度为30的结果
recommend.IBCF <- predict(model.IBCF, dishes, n = 30)  # 推荐列表
as(recommend.IBCF, "list")[1:5]  # 查看前五个用户的推荐# 将结果保存至工作目录下的文件中，需要将结果转换为list型。
# 对list型结果采用sink与print命令将其保存
sink("./tmp/recommend_IBCF.txt")
print(as(recommend.IBCF, "list"))
sink()# 基于用户的协同过滤
model.UBCF <- Recommender(dishes, method = "UBCF")  # 建模
# 利用模型对原始数据集进行预测并获得推荐长度为30的结果
recommend.UBCF <- predict(model.UBCF, dishes, n = 30)  # 推荐列表
as(recommend.UBCF, "list")[1:5]  # 查看前五个用户的推荐# 将结果保存至工作目录下的文件中，需要将结果转换为list型。
# 对list型结果采用sink与print命令将其保存
sink("./tmp/recommend_UBCF.txt")
print(as(recommend.UBCF, "list"))
sink()# 【离线评价两个推荐模型】
# 评价模型
algorithms <- list("ItemCF" = list(name = "IBCF", param = NULL),"UserCF" = list(name = "UBCF", param = NULL))
# 将数据以交叉检验划分成10份，9份训练，1份测试
dishes.es <- evaluationScheme(dishes, method = "cross-validation", k = 10, given = 1)
# 采用算法列表对数据进行模型预测与评价，其推荐值n取15, 20, 25, 30, 35
results <- evaluate(dishes.es, algorithms, n = c(15, 20, 25, 30, 35))# 画出评价结果的图形
plot(results, "prec/rec", legend = "topleft", cex = 0.67)# 构建F1的评价指标
fvalue <- function(p, r) {return(2 * p * r / (p + r))
}# 求两个模型的各个评价指标的均值，并将其转换为数据框的形式
library(plyr)
index <- ldply(avg(results))# 将指标第一列有关于模型的名字重新命名
index[, 1] <- paste(index[, 1], c(15, 20, 25, 30, 35))# 计算两个模型的F1的指标，并将所有指标综合
F1 <- fvalue(index[, 6], index[, 7])
dishes.Fvalue <- cbind(index, F1)
# 对评价指标值只保留3位小数
for (i in 2:ncol(dishes.Fvalue)) {dishes.Fvalue[, i] <- round(dishes.Fvalue[, i], 3)
}
write.csv(dishes.Fvalue, "./tmp/dishes_predict_index.csv", row.names = FALSE)

使用Apriori算法实现菜品的关联分析

# 【构建购物篮数据】
# 设置工作目录并读取数据
setwd()
info <- read.csv("./tmp/info_clear.csv", stringsAsFactors = FALSE)
detail <- read.csv("./tmp/detail_clear.csv", stringsAsFactors = FALSE)# 建立aliment列表,每个列表代表一个订单的菜品
order.id <- unique(detail$order_id)  # 对id去重
aliment <- list()
for(i in 1:length(order.id)){aliment[[i]] <- detail[which(detail$order_id == order.id[i]), 4]aliment[[i]] <- unique(aliment[[i]])  # 去掉出现重复的事务
}# 导出购物篮数据
require(plyr)
aliment1 <- ldply(aliment, rbind)  # 将列表转为数据框
row.names(aliment1) <- as.character(order.id)  # 修改数据框列名
write.csv(aliment1, "./tmp/aliment.csv")# 【构建二元矩阵和Apriori模型】
# 创建购物篮的二元矩阵
col <- levels(as.factor(unlist(aliment)))  # 提取aliment列表中每个菜的菜名
ruleData <- matrix(FALSE, length(aliment), length(col))  # 创建一个空的矩阵
colnames(ruleData) <- col  # 修改ruleData的列名
row.names(ruleData) <- as.character(order.id)  # 修改ruleData的行名
# 每一个订单中所包含的菜改为1
for(i in 1:length(aliment)){ruleData[i, match(aliment[[i]], col)] <- TRUE
}write.csv(ruleData, "./tmp/ruleData.csv", row.names = FALSE)  # 导出二元矩阵# 构建关联规则模型
library(arules)
# 把数据转换成关联规则需要的数据类型
trans <- as(aliment, "transactions")
# 或直接使用ruleData数据进行建模
# trans <- read.csv("./tmp/ruleData.csv", stringsAsFactors = FALSE)# 查看数据集前5行数据
inspect(trans[1:5])
# 生成关联规则
rules <- apriori(trans, parameter = list(support = 0.01, confidence = 0.5))
summary(rules)
inspect(sort(rules, by = list('support'))[1:10])  # 查看前10个支持度较高的规则# 绝对数量显示
itemFrequencyPlot(trans, type = 'absolute', topN = 10, horiz = T)# 查看前项为"芹菜炒腰花" 的规则
item <- subset(rules, subset = rhs %in% "芹菜炒腰花")
inspect(sort(item, by = "support"))# 导出规则数据
write(item, "./tmp/item.csv", sep = ",", row.names = FALSE)
write(rules, "./tmp/rules.csv", sep = ",", row.names = FALSE)# 【提取规则的前项和后项】
# 处理规则数据
result <- read.csv("./tmp/rules.csv", stringsAsFactors = FALSE)
# 将规则拆开
meal.recom <- strsplit(result$rules, "=>")# 去除中括号
lhs <- 0
rhs <- 0
for (i in 1:length(meal.recom)) {lhs[i] <- gsub("[{|}+\n]|\\s", "", meal.recom[[i]][1])rhs[i] <- gsub("[{|}+\n]|\\s", "", meal.recom[[i]][2])
}rules.new <- data.frame(lhs = lhs, rhs = rhs, support = result$support,confidence = result$confidence, lift = result$lift)write.csv(rules.new, "./tmp/rules_new.csv", row.names = FALSE)  # 写出数据# 【进行综合得分】
# 计算综合评分
# 读取数据
rules.new <- read.csv("./tmp/rules_new.csv", stringsAsFactors = FALSE)
sales_volume <- read.csv("./tmp/sales_volume.csv", stringsAsFactors = FALSE)
profit <- read.csv("./tmp/profit.csv", stringsAsFactors = FALSE)
dish <- read.csv("./data/meal_dishes_detail.csv", stringsAsFactors = FALSE)# 统计前项
rules.count <- as.data.frame(table(rules.new$lhs))
rules.count <- rules.count[order(rules.count$Freq, decreasing = TRUE), ]# 提取前项为“芹菜炒腰花,孜然羊排”的数据，对推荐的菜品进行综合评分
# 计算每个菜所推荐的菜的综合评分
# 设A的权重a1 = 1.5, a2 = 2.5, a3 = 2, a4 = 4
A <- matrix(c(0, 2.5, 2, 4, 1.5, 0, 2, 4,1.5, 2.5, 0, 4,1.5, 2.5, 2, 0), 4, 4, byrow = T)
E <- c(1, 1, 1, 1)# 初始化
rules.new$sales <- 0  # 热销度
rules.new$recommendation <- 0  # 主推度
rules.new$profit <- 0  # 毛利率
rules.new$mark <- 0  # 综合评分for (i in 1:nrow(rules.new)) {# 找到对应的热销度sales.num <- which(sales_volume$dishes_name == rules.new$rhs[i])rules.new$sales[i] <- sales_volume$sales_hot[sales.num]# 找到对应的毛利率和主推度profit.num <- which(profit$dishes_name == rules.new$rhs[i])rules.new$profit[i] <- profit$rate[profit.num]rules.new$recommendation[i] <- profit$recommend_percent[profit.num]# 计算综合评分Y <- c(rules.new$sales[i], rules.new$recommendation[i], rules.new$profit[i], rules.new$confidence[i])rules.new$mark[i] <- round((E - Y) %*% A %*% t(t(Y)), 3)
}# 对综合评分进行排序
rules.new <- rules.new[order(rules.new$mark, decreasing = TRUE), ]write.csv(rules.new, "./tmp/recommend.csv", row.names = FALSE)  # 写出数据# 选取后项为"芹菜炒腰花" 的数据
rules.item <- rules.new[which(rules.new$rhs == "芹菜炒腰花"), ]
write.csv(rules.item, "./tmp/rules_item.csv", row.names = FALSE)

使用K-means算法进行客户价值分析

# 【数据预处理】
# 设置工作目录并读取数据
setwd()
info <- read.csv("./data/meal_order_info.csv", stringsAsFactors = FALSE)
users <- read.csv("./data/users.csv", stringsAsFactors = FALSE)# 数据预处理
info <- info[which(info$order_status == 1), ]  # 提取有效订单# 对info的时间列按用户的ID去重
info_time <- info[, c("emp_id", "use_start_time")]
library(plyr)
info_time <- ddply(info_time, .(emp_id), tail, n = 1)# 匹配用户的最后一次用餐时间
for (i in 1:nrow(info)) {num <- which(users$USER_ID == info$emp_id[i])users[num, "LAST_VISITS"] <- info$use_start_time[i]
}user <- users[-which(users$LAST_VISITS == ""), c(1, 3, 13, 15)]  # 特征选取# 构建RFM特征
# 构建F特征
user.value1 <- as.data.frame(table(info$emp_id))  # 统计每个人的用餐次数
colnames(user.value1) <- c("USER_ID", "F")  # 修改列名# 构建M特征
user.value2 <- aggregate(info[, "expenditure"], list(info$emp_id), FUN = 'sum')
colnames(user.value2) <- c("USER_ID", "M")
user.value <- merge(user.value1, user.value2, by = c("USER_ID"))  # 合并两个表# 构建R特征
user.value <- merge(user.value, user, by = c("USER_ID"))  # 合并两个表
# 转换时间格式
last_time <- as.Date(user.value$LAST_VISITS, "%Y/%m/%d")
finally <- as.Date("2016-8-31")  # 观测窗口结束时间user.value$R <- as.numeric(difftime(finally, last_time, units = "days"))user.value <- user.value[, c(1,4,7,2,3)]  # 特征提取
write.csv(user.value, "./tmp/user_value.csv", row.names = FALSE)# 【构建模型】
# 确定类数
user.value <- read.csv("./tmp/user_value.csv", stringsAsFactors = FALSE)
USER_ID <- user.value$USER_ID
ACCOUNT <- user.value$ACCOUNT
user.value <- user.value[, -c(1,2)]# 标准化数据
standard <- scale(user.value)  # 数据标准化
write.csv(standard, './tmp/standard.csv', row.names = FALSE)  # 写出数据# 求组间距离平方和与总体距离平方和的比值（betweenss / totss，越接近1越好）
BT <- 0
for(i in 1:10){model <- kmeans(user.value, centers = i)BT[i] <- model$betweenss / model$totss
}
plot(1:10, BT, type = "b", xlab = "聚类数", ylab = "组间平方和/总体距离平方和")# 构建模型
set.seed(123)
result <- kmeans(standard, 3)result$center  # 查看聚类中心值
result$size  # 查看每一类的个数# 导出聚类后的数据
users.class <- cbind(USER_ID, ACCOUNT, user.value, class = result$cluster)
write.csv(users.class, "./tmp/users_class.csv", row.names = FALSE)# 每一簇各指标的关系程度--雷达图
library(fmsb)
max <- apply(result$centers, 2, max)
min <- apply(result$centers, 2, min)
radar <- data.frame(rbind(max, min, result$centers))
radarchart(radar, pty = 32, plty = c(1:3), plwd = 4, vlcex = 1.2)  # 画雷达图
# 给雷达图加图例
L <- 1.2
for(i in 1:3){legend(1.0, L, legend = paste("客户群", i), lty = i, lwd = 3, col = i, bty = "n")L <- L - 0.3
}

用决策树算法实现餐饮客户流失预测

# 【合并客户信息表和订单表】
# 设置工作目录
setwd()
# 合并两个表
# 读取数据
users <- read.csv("./data/user_loss.csv", stringsAsFactors = FALSE)
info <- read.csv("./data/info_new.csv", stringsAsFactors = FALSE)# 将时间转为时间格式
library(lubridate)
info$use_start_time <- parse_date_time(info$use_start_time, orders = "YmdHMS")
# info$lock_time <- parse_date_time(info$lock_time, orders = "YmdHMS")
# users$CREATED <- parse_date_time(users$CREATED, orders = "YmdHMS")# 对info的时间列按用户的ID去重
info_time <- info[, c("emp_id", "use_start_time")]
library(plyr)
info_time <- ddply(info_time, .(emp_id), tail, n = 1)# 匹配用户的最后一次用餐时间
for (i in 1:nrow(users)) {info1 <- info[which(info$name == users$ACCOUNT[i]), ]  # 提取某用户的订单数据if(nrow(info1) >= 1){info1 <- info1[order(info1$use_start_time), ]users[i, "LAST_VISITS"] <- as.character(info1$use_start_time[nrow(info1)])}
}# 特征选取
user <- users[, c(1, 3, 15, 38)]
info <- info[which(info$order_status == 1), c(2,3,7)]  # 提取有效订单names(info)[1] <- "USER_ID"  # 修改列名info.user <- merge(user, info, by = "USER_ID")  # 合并两个表
write.csv(info.user, "./tmp/info_user.csv", row.names = FALSE)# 【构建特征】
info.user <- read.csv("./tmp/info_user.csv", stringsAsFactors = FALSE)# 提取info表的用户名和用餐时间，并按人名对用餐人数和金额进行分组求和
info.user1 <- as.data.frame(table(info.user$USER_ID))  # 统计每个人的用餐次数
colnames(info.user1) <- c("USER_ID", "frequence")  # 修改列名# 求出每个人的消费总金额
info.user2 <- aggregate(info.user[, c(("number_consumers"), ("expenditure"))], list(info.user$USER_ID), FUN = "sum")  # 分组求和
colnames(info.user2) <- c("USER_ID", "numbers", "amount")
info.user.new <- merge(info.user1, info.user2, by = c("USER_ID"))  # 合并两个表# 对合并后的数据进行处理
info.user <- info.user[, c(1:4)]
library(plyr)
info.user <- ddply(info.user, .(USER_ID, LAST_VISITS, type), tail, n = 1)info.user.new <- merge(info.user.new, info.user, by = "USER_ID")  # 合并两个表# 求平均消费金额，并保留2为小数
info.user.new$average <- round(info.user.new$amount / info.user.new$numbers, 2)
# 计算每个客户最近一次点餐的时间距离观测窗口结束的天数
# 修改时间列，改为日期
info.user.new$LAST_VISITS <- as.Date(info.user.new$LAST_VISITS)datefinally <- as.Date("2016-7-31")  # 观测窗口结束时间info.user.new$recently  <- difftime(datefinally, info.user.new$LAST_VISITS, units = "days")  # 计算时间差
info.user.new$recently  <- as.numeric(info.user.new$recently )  # 转为数值型info.user.new <- info.user.new[, c(1,5,2,4,8,9,7)]  # 特征选取
write.csv(info.user.new, "./tmp/info_user_clear.csv", row.names = FALSE)# 【构建决策树模型并评价】
# 划分测试集、训练集
info.user <- read.csv("./tmp/info_user_clear.csv", stringsAsFactors = FALSE)# 删除流失用户
info.user <- info.user[-which(info.user$type == "已流失"), ]model.data <- info.user[, c(3:7)]
model.data$type <- as.factor(model.data$type)  # 将类标号转换为因子型set.seed(12345)  # 设置随机种子
ind <- sample(2, nrow(model.data), replace = TRUE, prob = c(0.8, 0.2))
train <- model.data[ind == 1, ]
test <- model.data[ind == 2, ]# 查看样本分布
table(train$type)
table(test$type)# 构建决策树模型
library(rpart)
# method="class"为分类树；
# parms = list(split = "information")为选择信息熵计算纯度；
# xval设置交叉验证次数；minsplit设置最小分割；cp设置剪枝率。
rpart <- rpart(type ~ ., train, method = "class",parms = list(split = "information"),control=rpart.control(xval = 10, minsplit = 20, cp = 0.01))# 画树
library(rattle)
fancyRpartPlot(rpart, cex = 0.7)  # 画决策图彩色图
asRules(rpart)  # 导出决策规则#  测试集预测
pre.rpart <- predict(rpart, test, type = "class")  # 预测
table.pre <- table(test$type, pre.rpart)  # 混淆矩阵(P <- table.pre[2, 2] / sum(table.pre[2, ]))  # 精确率
(R <- table.pre[2, 2] / sum(table.pre[, 2]))  # 召回率
(F1 <- 2 * P * R / (P + R))

看书标记【R语言商务数据分析实战9】相关推荐

看书标记【R语言商务数据分析实战4】
看书标记--关于R语言 chapter 4 4.2 任务实现 [R语言商务数据分析实战4] chapter 4 财政收入预测分析从海量数据中发现隐藏的运行模式,并提供具有决策意义的信息.变量过多会 ...
看书标记【R语言商务数据分析实战5】
看书标记--关于R语言 chapter 5 5.2 任务实现 [R语言商务数据分析实战5] chapter 5 金融服务机构资金流量预测数据理解和预处理>>检验平稳性+纯随机性> ...
看书标记【R语言商务数据分析实战6】
看书标记--关于R语言 chapter 6 6.2 任务实 [R语言商务数据分析实战6] chapter 6 P2P信用贷款风险控制(用户逾期还款概率模型) 关于数据库的应用+数据清洗+实时数据识别 ...
【R语言与数据分析实战】数据操作（一）：基于向量的处理与外部数据处理
目录 1.R中的常用数据集 2.读写CSV文件 (1) 读写CSV文件 (2) 读写对象文件 3.合并数据框的行与列 4.apply系数函数 (1) apply (2) lapply (3) sapp ...
【R语言与数据分析实战】R软件编程
目录 1.流程控制 (1) if语句 (2) 循环语句 2.运算 (1) 数值运算 (2) 向量运算 (3) NA处理 3.定义函数 (1) 可变长函数 (2) 嵌套函数 4.作用域 5.对象的不变性 ...
【R语言与数据分析实战】绘图
目录 1.散点图 2.图像选项 2.1 坐标轴名称 2.2 图形标题 2.3 点的类型 2.4 点的大小 2.5 颜色 2.6 坐标轴的取值范围 2.7 图像类型 2.8 线型 2.9 图形排列 2. ...
R语言医学数据分析实战（三）数据可视化
文章目录一.用R的基础绘图系统作图 1.函数plot() 2.直方图和密度曲线图 3.条形图 4.饼图 5.箱线图和小提琴图 6.克里夫兰点图二.用ggplot2包作图 1.初识ggplot2包 ...
R语言与数据分析实战1-基础
第1关:R语言的数据类型本关是我们的第一关,任务非常简单,只要搞清楚我们面对的数据类型是什么,并能实现数据类型之间的转换,就可以过关啦! # 将 "3.14" 赋值给变量 pi, ...
R语言与数据分析实战11-数据的删除
第1关:数据的删除在实际的数据分析任务中,我们所拿到的数据往往存在着各种瑕疵,比如大量的缺失值.野值.无效观测等,因此我们除了按照前序课程讲的那样提取出有用数据外,还可以将无效或者错误的数据直接进行 ...

看书标记【R语言商务数据分析实战9】

看书标记——关于R语言

chapter 9

9.2 任务实现

看书标记【R语言商务数据分析实战9】相关推荐

最新文章

热门文章

看书标记【R语言 商务数据分析实战9】

看书标记——关于R语言

chapter 9

9.2 任务实现

看书标记【R语言 商务数据分析实战9】相关推荐

最新文章

热门文章

看书标记【R语言商务数据分析实战9】

看书标记【R语言商务数据分析实战9】相关推荐