##概述 dplyr下篇
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
按照行的形式,对行进行操作
# 按照行实现两列的行求和
iris[,1:4] %>%
rowwise() %>%
mutate(total = sum(c(Sepal.Length, Sepal.Width))) %>% head()
## # A tibble: 6 x 5
## # Rowwise:
## Sepal.Length Sepal.Width Petal.Length Petal.Width total
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2 8.6
## 2 4.9 3 1.4 0.2 7.9
## 3 4.7 3.2 1.3 0.2 7.9
## 4 4.6 3.1 1.5 0.2 7.7
## 5 5 3.6 1.4 0.2 8.6
## 6 5.4 3.9 1.7 0.4 9.3
Note that the echo = FALSE
parameter was added to the code chunk to prevent printing of the R code that generated the plot.
# 从iris中选择数字列,并进行计算
iris %>%
rowwise() %>%
mutate(total = sum(c_across(where(is.numeric)))) %>% head()
## # A tibble: 6 x 6
## # Rowwise:
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species total
## <dbl> <dbl> <dbl> <dbl> <fct> <dbl>
## 1 5.1 3.5 1.4 0.2 setosa 10.2
## 2 4.9 3 1.4 0.2 setosa 9.5
## 3 4.7 3.2 1.3 0.2 setosa 9.4
## 4 4.6 3.1 1.5 0.2 setosa 9.4
## 5 5 3.6 1.4 0.2 setosa 10.2
## 6 5.4 3.9 1.7 0.4 setosa 11.4
当然上述的方式通过r语言的一些简单操作也可以实现,但是dplyr可以实现的不止是求和的功能
# 为iris建立id
iris$id <- rownames(iris)
# 按照id进行合并求和
iris %>%
rowwise(id) %>%
summarise(total = sum(c_across(where(is.numeric)))) %>% head()
## `summarise()` regrouping output by 'id' (override with `.groups` argument)
## # A tibble: 6 x 2
## # Groups: id [6]
## id total
## <chr> <dbl>
## 1 1 10.2
## 2 2 9.5
## 3 3 9.4
## 4 4 9.4
## 5 5 10.2
## 6 6 11.4
如果我们想使用不同种类的鸢尾花进行建模,批量处理,那么就可使用nest_by()做嵌套处理
by_species <- iris %>%
nest_by(Species)
head(by_species)
## # A tibble: 3 x 2
## # Rowwise: Species
## Species data
## <fct> <list<tbl_df[,5]>>
## 1 setosa [50 x 5]
## 2 versicolor [50 x 5]
## 3 virginica [50 x 5]
# 按照rowwise的逻辑,按照这三行进行分别建模
by_species = by_species %>%
mutate(model = list(lm(Sepal.Width ~ Sepal.Length, data = data)),
pred = list(predict(model, data)))
by_species
## # A tibble: 3 x 4
## # Rowwise: Species
## Species data model pred
## <fct> <list<tbl_df[,5]>> <list> <list>
## 1 setosa [50 x 5] <lm> <dbl [50]>
## 2 versicolor [50 x 5] <lm> <dbl [50]>
## 3 virginica [50 x 5] <lm> <dbl [50]>
# 结果为包含每个鸢尾花种类的数据,模型和预测值
# 这里的by_species为上述代码执行后产生的数据
# 对于这些统计量的计算,建议查看统计相关教材
by_species = by_species %>%
mutate(rmse = sqrt(mean((pred - data$Sepal.Width) ^ 2)),
rsq = summary(model)$r.squared,
slope = summary(model)$coefficients[2])
by_species
## # A tibble: 3 x 7
## # Rowwise: Species
## Species data model pred rmse rsq slope
## <fct> <list<tbl_df[,5]>> <list> <list> <dbl> <dbl> <dbl>
## 1 setosa [50 x 5] <lm> <dbl [50]> 0.251 0.551 0.799
## 2 versicolor [50 x 5] <lm> <dbl [50]> 0.264 0.277 0.320
## 3 virginica [50 x 5] <lm> <dbl [50]> 0.284 0.209 0.232
# 现在模型建好了,另外还有诊断信息,包括bic,p值等统计量的提取
# 使用summarise函数
library(broom)
by_species %>%
summarise(glance(model))
## `summarise()` regrouping output by 'Species' (override with `.groups` argument)
## # A tibble: 3 x 12
## # Groups: Species [3]
## Species r.squared adj.r.squared sigma statistic p.value df logLik AIC
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
## 1 setosa 0.551 0.542 0.257 59.0 6.71e-10 2 -1.90 9.80
## 2 versic~ 0.277 0.262 0.270 18.4 8.77e- 5 2 -4.40 14.8
## 3 virgin~ 0.209 0.193 0.290 12.7 8.43e- 4 2 -8.00 22.0
## # ... with 3 more variables: BIC <dbl>, deviance <dbl>, df.residual <int>
# 提取模型系数
by_species %>%
summarise(tidy(model))
## `summarise()` regrouping output by 'Species' (override with `.groups` argument)
## # A tibble: 6 x 6
## # Groups: Species [3]
## Species term estimate std.error statistic p.value
## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa (Intercept) -0.569 0.522 -1.09 2.81e- 1
## 2 setosa Sepal.Length 0.799 0.104 7.68 6.71e-10
## 3 versicolor (Intercept) 0.872 0.445 1.96 5.56e- 2
## 4 versicolor Sepal.Length 0.320 0.0746 4.28 8.77e- 5
## 5 virginica (Intercept) 1.45 0.431 3.36 1.55e- 3
## 6 virginica Sepal.Length 0.232 0.0651 3.56 8.43e- 4
# 结果为按照鸢尾花种类给出模型系数
# 提取样本预测值,残差
by_species %>%
summarise(augment(model)) %>% head()
## `summarise()` regrouping output by 'Species' (override with `.groups` argument)
## # A tibble: 6 x 10
## # Groups: Species [1]
## Species Sepal.Width Sepal.Length .fitted .se.fit .resid .hat .sigma
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 3.5 5.1 3.50 0.0376 -0.00306 0.0215 0.259
## 2 setosa 3 4.9 3.34 0.0379 -0.343 0.0218 0.254
## 3 setosa 3.2 4.7 3.18 0.0483 0.0163 0.0354 0.259
## 4 setosa 3.1 4.6 3.10 0.0557 -0.00380 0.0471 0.259
## 5 setosa 3.6 5 3.42 0.0363 0.177 0.0200 0.258
## 6 setosa 3.9 5.4 3.74 0.0547 0.157 0.0455 0.258
## # ... with 2 more variables: .cooksd <dbl>, .std.resid <dbl>
# slice让我想起的python里的操作
# slice(df, 3:7) # 选择3-7行
# lice_head(df, n, prop) # 从前面开始选择若干行
# slice_tail(df, n, prop) # 从后面开始选择若干行
# slice_min(df, order_by, n, prop) # 根据order_by选择最小的若干行
# slice_max(df, order_by, n, prop) # 根据order_by选择最大的若干行
# slice_sample(df, n, prop) # 随机选择若干行
# 随机选取最大的Sepal.Length 5个
iris %>%
slice_max(Sepal.Length, n = 5)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species id
## 1 7.9 3.8 6.4 2.0 virginica 132
## 2 7.7 3.8 6.7 2.2 virginica 118
## 3 7.7 2.6 6.9 2.3 virginica 119
## 4 7.7 2.8 6.7 2.0 virginica 123
## 5 7.7 3.0 6.1 2.3 virginica 136
# 将数字放在Species之后
iris %>%
relocate(where(is.numeric), .after = Species) %>% head()
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width id
## 1 setosa 5.1 3.5 1.4 0.2 1
## 2 setosa 4.9 3.0 1.4 0.2 2
## 3 setosa 4.7 3.2 1.3 0.2 3
## 4 setosa 4.6 3.1 1.5 0.2 4
## 5 setosa 5.0 3.6 1.4 0.2 5
## 6 setosa 5.4 3.9 1.7 0.4 6
# 求百分位数,最终的结果为一个长数据的格式
iris %>%
group_by(Species) %>%
summarise(Sepal.Length_qs = quantile( Sepal.Length, c(0.25, 0.5, 0.75)), q =c( 0.25, 0.5, 0.75)) %>% head()
## `summarise()` regrouping output by 'Species' (override with `.groups` argument)
## # A tibble: 6 x 3
## # Groups: Species [2]
## Species Sepal.Length_qs q
## <fct> <dbl> <dbl>
## 1 setosa 4.8 0.25
## 2 setosa 5 0.5
## 3 setosa 5.2 0.75
## 4 versicolor 5.6 0.25
## 5 versicolor 5.9 0.5
## 6 versicolor 6.3 0.75
关于dplyr的新版本更新完毕,总体来说新版本还是有些都东西的,后续处理数据的时候,应该是能用的到的。seeyou, love & peace.