dplyr包功能（数据清理、过滤、合并R实现）

去除重复项

选取随机样本

变量重命名

select()函数

filter()函数

summarise()函数

arrange()函数

group_by() 函数

mutate()函数

join()函数

R软件包dplyr用于数据清理、处理、可视化和分析，包含了很多有用的功能，与ggplot2、reshape2并称为数据分析及可视化的三大包。

select()
filter()
mutate()
group_by()
summarise()
arrange()
join()

示例数据

# Load dplyr. library() stops with an error when the package is missing,
# whereas require() only returns FALSE and lets the script continue.
library(dplyr)

# Data file
file <- "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Some sensible variable names
df_names <- c(
  "age", "wrkclass", "fnlweight", "education_lvl", "edu_score",
  "marital_status", "occupation", "relationship", "ethnic", "gender",
  "cap_gain", "cap_loss", "hrs_wk", "nationality", "income"
)

# Import the data. No header row is read from the file, so the column names
# are supplied explicitly; " ?", " " and "" are treated as missing values.
df <- read.csv(
  file,
  header = FALSE,
  sep = ",",
  na.strings = c(" ?", " ", ""),
  row.names = NULL,
  col.names = df_names
)

dplyr中的许多数据操作任务都可以在前向管道运算符（%>%）的帮助下执行。该管道运算符最初是在magrittr软件包中引入的，此后已包含在dplyr软件包中。它是进行流畅数据操作的非常有用的工具，可产生高度可读的代码。普查数据集需要一些预处理才能用于分类算法；本文不涉及预处理，也不包括预测建模。

去除重复项

# Count the rows that remain after dropping duplicate rows
df %>%
  distinct() %>%
  nrow()

# Drop duplicate rows and assign the result to a new data frame
df_clean <- df %>%
  distinct()

# Drop duplicates based on one or more variables.
# .keep_all = TRUE keeps the other columns as well (TRUE is spelled out
# because T is an ordinary variable that can be reassigned).
df %>%
  distinct(gender, .keep_all = TRUE)
df %>%
  distinct(gender, education_lvl, .keep_all = TRUE)

选取随机样本

# Sample a fixed number of random rows, without or with replacement.
# floor() turns 70% of the row count into a whole number of rows.
sample_n(df, size = floor(nrow(df) * 0.7), replace = FALSE)
sample_n(df, size = 20, replace = TRUE)

# Sample a fraction of the rows, without or with replacement
sample_frac(df, size = 0.7, replace = FALSE)
sample_frac(df, size = 0.8, replace = TRUE)

变量重命名

# Rename several variables at once. The result is only printed here, so df
# keeps its `age` column, which the later examples still refer to.
df %>%
  rename("INCOME" = "income", "AGE" = "age")

# Rename one variable and keep the result. This runs last because `income`
# no longer exists afterwards; the later examples use the new name INCOME.
df <- df %>%
  rename("INCOME" = "income")

select()函数

# Select specific columns (INCOME is the new name given by the earlier rename)
df %>%
  select(education_lvl, INCOME)

# As of dplyr 0.7.0, pull() extracts a single variable as a vector
df %>%
  pull(age)

# Drop columns with the - operator
# (variables can be referenced by name or by column position)
df %>%
  select(-edu_score)
df %>%
  select(-1, -4)
df %>%
  select(-c(2:6))

有些功能可与select函数结合，下面为一些示例

# Select columns whose names start with "e"
df %>%
  select(starts_with("e"))

# The - sign works with helpers too: drop columns starting with "e"
df %>%
  select(-starts_with("e"))

# Select columns whose names contain a given string
df %>%
  select(contains("edu"))

# Reorder columns: put INCOME first, followed by everything else
df %>%
  select(INCOME, everything())

# Select columns whose names end with a given string
df %>%
  select(ends_with("e"))
df %>%
  select(ends_with("_loss"))

filter()函数

# Keep only the observations with age greater than 30
df %>%
  filter(age > 30)

# Match against several values with %in%
# (the strings must match the data exactly, including the leading space)
df %>%
  filter(relationship %in% c(" Unmarried", " Wife"))

# Combine conditions with the OR operator (|)
df %>%
  filter(relationship == " Husband" | relationship == " Wife")

# Combine conditions with the AND operator (&)
df %>%
  filter(age > 30 & INCOME == " >50K")

# Mix %in% with another condition
df %>%
  filter(education_lvl %in% c(" Doctorate", " Masters") & age > 30)

# Negation: keep rows that are not equal to a value
df %>%
  filter(education_lvl != " Doctorate")

# grepl() can be used inside filter() for pattern matching
df %>%
  filter(grepl(" Wi", relationship))

summarise()函数

可以即时汇总分组数据，甚至可以将分组汇总的结果通过管道传给ggplot进行数据可视化。

# NOTE(review): funs() is reported as deprecated in newer dplyr releases
# (0.8.0 onwards) in favour of list() with ~ lambdas or across(); it is kept
# here to match the dplyr version this article cites - confirm the target version.

# Summarise a filtered subset of the data
df %>%
  filter(INCOME == " >50K") %>%
  summarise(
    mean_age = mean(age),
    median_age = median(age),
    sd_age = sd(age)
  )

# Summarise several variables with summarise_at()
df %>%
  filter(INCOME == " >50K") %>%
  summarise_at(vars(age, hrs_wk), funs(n(), mean, median))

# Inside funs(), the dot (.) stands for each selected variable
df %>%
  summarise_at(
    vars(age, hrs_wk),
    funs(
      n(),
      missing = sum(is.na(.)),
      mean = mean(., na.rm = TRUE)
    )
  )

# Use an anonymous function to create a new summary statistic
df %>%
  summarise_at(vars(age), function(x) {
    sum((x - mean(x)) / sd(x))
  })

# Summarise conditionally with summarise_if()
df %>%
  filter(INCOME == " >50K") %>%
  summarise_if(is.numeric, funs(n(), mean, median))

# Subset the numeric variables and summarise them all with summarise_all().
# vapply() always returns a logical vector, unlike sapply(), whose return
# type depends on its input.
ints <- df[vapply(df, is.numeric, logical(1))]
summarise_all(ints, funs(mean, median, sd, var))

arrange()函数

升序或降序排列（默认升序）

# Sort by age in ascending order and show the first 10 rows
df %>%
  arrange(age) %>%
  head(10)

# Sort by age in descending order and show the first 10 rows
df %>%
  arrange(desc(age)) %>%
  head(10)

group_by() 函数

# Grouped summaries for routine data analysis

# Mean age per gender
df %>%
  group_by(gender) %>%
  summarise(Mean = mean(age))

# Number of rows per relationship category
df %>%
  group_by(relationship) %>%
  summarise(total = n())

# Number of rows and mean age per relationship category
df %>%
  group_by(relationship) %>%
  summarise(
    total = n(),
    mean_age = mean(age)
  )

mutate()函数

mutate（）用于从现有的局部变量或全局对象创建新变量。也可以在mutate（）中指定新变量，例如序列。

# Build a new variable from an existing one (standardised age)
df %>%
  mutate(norm_age = (age - mean(age)) / sd(age))

# Multiply every numeric column by 1000
# (the name "new" is added to the original variable names)
df %>%
  mutate_if(is.numeric, funs(new = (. * 1000))) %>%
  head()

join()函数

join（）用于根据共同的ID或其他公共变量，合并来自不同表的行。join有很多变体，常用的有left join、right join、inner join和full join。

# Build an ID column to use as the join key and move it to the front.
# seq_len() is safe even for a zero-row data frame, unlike 1:nrow(df).
df <- df %>%
  mutate(ID = seq_len(nrow(df))) %>%
  select(ID, everything())

# Create two tables whose IDs partially overlap (rows 26-50 are in both)
table_1 <- df[1:50, ] %>%
  select(ID, age, education_lvl)
table_2 <- df[26:75, ] %>%
  select(ID, gender, INCOME)

# Left join: keep every row of table_1 and attach matching rows of table_2
# (the direction is implied by the argument order)
left_join(table_1, table_2, by = "ID")

# Right join: keep every row of table_2 and attach matching rows of table_1
right_join(table_1, table_2, by = "ID")

# Inner join: keep only the IDs present in both tables
inner_join(table_1, table_2, by = "ID")

# Full join: keep all rows from both tables
full_join(table_1, table_2, by = "ID")

以上总结了dplyr的一些出色功能。有关函数及其参数的更多信息，请使用 ?函数名 的形式（例如 ?select）查看帮助文档。

References

Hadley Wickham, Romain Francois, Lionel Henry and Kirill Müller (2017). dplyr: A
Grammar of Data Manipulation. R package version 0.7.0.
https://CRAN.R-project.org/package=dplyr

H. Wickham. ggplot2: Elegant Graphics for Data Analysis. Springer-Verlag New
York, 2009.