今天我们继续来看 dplyr 的 arrange 和 mutate。
我们想按 sleep_total 降序排列行,可以使用 desc 函数;
msleep %>% arrange(desc(sleep_total)) %>% glimpse()
Observations: 83
Variables: 11
$ name <chr> "Little brown bat", "Big brown bat", "Thick-tailed opposu…
$ genus <chr> "Myotis", "Eptesicus", "Lutreolina", "Priodontes", "Didel…
$ vore <chr> "insecti", "insecti", "carni", "insecti", "omni", "carni"…
$ order <chr> "Chiroptera", "Chiroptera", "Didelphimorphia", "Cingulata…
$ conservation <chr> NA, "lc", "lc", "en", "lc", "lc", NA, "lc", "lc", "en", N…
$ sleep_total <dbl> 19.9, 19.7, 19.4, 18.1, 18.0, 17.4, 17.0, 16.6, 15.9, 15.…
$ sleep_rem <dbl> 2.0, 3.9, 6.6, 6.1, 4.9, 3.1, 1.8, NA, 3.0, NA, NA, 2.3, …
$ sleep_cycle <dbl> 0.2000000, 0.1166667, NA, NA, 0.3333333, 0.3833333, NA, N…
$ awake <dbl> 4.1, 4.3, 4.6, 5.9, 6.0, 6.6, 7.0, 7.4, 8.1, 8.2, 8.2, 8.…
$ brainwt <dbl> 0.00025, 0.00030, NA, 0.08100, 0.00630, 0.01080, 0.01550,…
$ bodywt <dbl> 0.010, 0.023, 0.370, 60.000, 1.700, 3.500, 0.480, 0.920, …
如果想按多个变量进行排列,例如先按 sleep_total 降序、再按 sleep_rem 升序,把它们依次传给 arrange 即可;
msleep %>% arrange(desc(sleep_total),sleep_rem) %>% glimpse()
Observations: 83
Variables: 11
$ name <chr> "Little brown bat", "Big brown bat", "Thick-tailed opposu…
$ genus <chr> "Myotis", "Eptesicus", "Lutreolina", "Priodontes", "Didel…
$ vore <chr> "insecti", "insecti", "carni", "insecti", "omni", "carni"…
$ order <chr> "Chiroptera", "Chiroptera", "Didelphimorphia", "Cingulata…
$ conservation <chr> NA, "lc", "lc", "en", "lc", "lc", NA, "lc", "lc", "en", N…
$ sleep_total <dbl> 19.9, 19.7, 19.4, 18.1, 18.0, 17.4, 17.0, 16.6, 15.9, 15.…
$ sleep_rem <dbl> 2.0, 3.9, 6.6, 6.1, 4.9, 3.1, 1.8, NA, 3.0, NA, NA, 2.3, …
$ sleep_cycle <dbl> 0.2000000, 0.1166667, NA, NA, 0.3333333, 0.3833333, NA, N…
$ awake <dbl> 4.1, 4.3, 4.6, 5.9, 6.0, 6.6, 7.0, 7.4, 8.1, 8.2, 8.2, 8.…
$ brainwt <dbl> 0.00025, 0.00030, NA, 0.08100, 0.00630, 0.01080, 0.01550,…
$ bodywt <dbl> 0.010, 0.023, 0.370, 60.000, 1.700, 3.500, 0.480, 0.920, …
当有缺失值时,我们看看排序的结果,NA 都被排在了最后;
msleep %>% arrange(sleep_rem) %>% tail %>% glimpse()
Observations: 6
Variables: 11
$ name <chr> "African striped mouse", "Arctic ground squirrel", "Short…
$ genus <chr> "Rhabdomys", "Spermophilus", "Tachyglossus", "Tamias", "T…
$ vore <chr> "omni", "herbi", "insecti", "herbi", "carni", "carni"
$ order <chr> "Rodentia", "Rodentia", "Monotremata", "Rodentia", "Cetac…
$ conservation <chr> NA, "lc", NA, NA, NA, NA
$ sleep_total <dbl> 8.7, 16.6, 8.6, 15.8, 5.2, 12.5
$ sleep_rem <dbl> NA, NA, NA, NA, NA, NA
$ sleep_cycle <dbl> NA, NA, NA, NA, NA, NA
$ awake <dbl> 15.3, 7.4, 15.4, 8.2, 18.8, 11.5
$ brainwt <dbl> NA, 0.0057, 0.0250, NA, NA, 0.0445
$ bodywt <dbl> 0.044, 0.920, 4.500, 0.112, 173.330, 3.380
如果我们想让缺失值排在前面,可以先用 is.na 判断是否缺失,再对判断结果降序排列(TRUE 排在 FALSE 之前);
msleep %>% arrange(desc(is.na(sleep_rem))) %>% glimpse()
Observations: 83
Variables: 11
$ name <chr> "Cheetah", "Vesper mouse", "Roe deer", "Asian elephant", …
$ genus <chr> "Acinonyx", "Calomys", "Capreolus", "Elephas", "Eutamias"…
$ vore <chr> "carni", NA, "herbi", "herbi", "herbi", "herbi", "herbi",…
$ order <chr> "Carnivora", "Rodentia", "Artiodactyla", "Proboscidea", "…
$ conservation <chr> "lc", NA, "lc", "en", NA, "vu", NA, "nt", NA, "lc", "en",…
$ sleep_total <dbl> 12.1, 7.0, 3.0, 3.9, 14.9, 3.3, 12.8, 14.6, 11.0, 14.5, 1…
$ sleep_rem <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ sleep_cycle <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ awake <dbl> 11.90, 17.00, 21.00, 20.10, 9.10, 20.70, 11.20, 9.40, 13.…
$ brainwt <dbl> NA, NA, 0.09820, 4.60300, NA, 5.71200, NA, NA, 0.01250, N…
$ bodywt <dbl> 50.000, 0.045, 14.800, 2547.000, 0.071, 6654.000, 0.035, …
有时我们需要创建新变量,例如新建一列 newcol,其值为 sleep_total - 1;
msleep %>% mutate(newcol=sleep_total-1) %>%
+ select(newcol,sleep_total, everything()) %>% glimpse()
Observations: 83
Variables: 12
$ newcol <dbl> 11.1, 16.0, 13.4, 13.9, 3.0, 13.4, 7.7, 6.0, 9.1, 2.0, 4.…
$ sleep_total <dbl> 12.1, 17.0, 14.4, 14.9, 4.0, 14.4, 8.7, 7.0, 10.1, 3.0, 5…
$ name <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Greater shor…
$ genus <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bos", "Bra…
$ vore <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi", "carn…
$ order <chr> "Carnivora", "Primates", "Rodentia", "Soricomorpha", "Art…
$ conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", NA, "dome…
$ sleep_rem <dbl> NA, 1.8, 2.4, 2.3, 0.7, 2.2, 1.4, NA, 2.9, NA, 0.6, 0.8, …
$ sleep_cycle <dbl> NA, NA, NA, 0.1333333, 0.6666667, 0.7666667, 0.3833333, N…
$ awake <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, 13.9, 21.0, 1…
$ brainwt <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0.07000, 0…
$ bodywt <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.490, 0.04…
如果我们只想保留新构造的列 newcol,可以用 transmute 函数;
msleep %>%transmute(newcol=sleep_total-1) %>%
+ select(newcol, everything()) %>% glimpse()
Observations: 83
Variables: 1
$ newcol <dbl> 11.1, 16.0, 13.4, 13.9, 3.0, 13.4, 7.7, 6.0, 9.1, 2.0, 4.3, 8.4…
如果我们想对所有变量都进行相同的变换,mutate_all 函数就派上用场了;例如用 tolower 把所有值转成小写,注意数值型的列也会因此被转换成字符型;
msleep %>%
+ mutate_all(tolower) %>% glimpse()
Observations: 83
Variables: 11
$ name <chr> "cheetah", "owl monkey", "mountain beaver", "greater shor…
$ genus <chr> "acinonyx", "aotus", "aplodontia", "blarina", "bos", "bra…
$ vore <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi", "carn…
$ order <chr> "carnivora", "primates", "rodentia", "soricomorpha", "art…
$ conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", NA, "dome…
$ sleep_total <chr> "12.1", "17", "14.4", "14.9", "4", "14.4", "8.7", "7", "1…
$ sleep_rem <chr> NA, "1.8", "2.4", "2.3", "0.7", "2.2", "1.4", NA, "2.9", …
$ sleep_cycle <chr> NA, NA, NA, "0.133333333", "0.666666667", "0.766666667", …
$ awake <chr> "11.9", "7", "9.6", "9.1", "20", "9.6", "15.3", "17", "13…
$ brainwt <chr> NA, "0.0155", NA, "0.00029", "0.423", NA, NA, NA, "0.07",…
$ bodywt <chr> "50", "0.48", "1.35", "0.019", "600", "3.85", "20.49", "0…
>
如果我们的数据都是数值型或者都是字符型,我们可以用 mutate_all 批量处理;当两者都有的时候,它就不太好用了,这时就该用 mutate_if 了。
直接上 mutate_all 的话就报错了,因为我们有多种类型;
msleep %>%
+ mutate_all(round) %>% glimpse()
Error in .Primitive("round")(name) :
non-numeric argument to mathematical function
我们可以用 mutate_if 加一个判断条件,若是数值型就把它四舍五入;
msleep %>%
+ mutate_if(is.numeric, round) %>% glimpse()
Observations: 83
Variables: 11
$ name <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Greater shor…
$ genus <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bos", "Bra…
$ vore <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi", "carn…
$ order <chr> "Carnivora", "Primates", "Rodentia", "Soricomorpha", "Art…
$ conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", NA, "dome…
$ sleep_total <dbl> 12, 17, 14, 15, 4, 14, 9, 7, 10, 3, 5, 9, 10, 12, 10, 8, …
$ sleep_rem <dbl> NA, 2, 2, 2, 1, 2, 1, NA, 3, NA, 1, 1, 1, 2, 2, 2, 1, 3, …
$ sleep_cycle <dbl> NA, NA, NA, 0, 1, 1, 0, NA, 0, NA, NA, 0, NA, 0, NA, NA, …
$ awake <dbl> 12, 7, 10, 9, 20, 10, 15, 17, 14, 21, 19, 15, 14, 12, 14,…
$ brainwt <dbl> NA, 0, NA, 0, 0, NA, NA, NA, 0, 0, 0, 0, NA, 0, 0, 0, 0, …
$ bodywt <dbl> 50, 0, 1, 0, 600, 4, 20, 0, 14, 15, 34, 1, 5, 0, 0, 1, 0,…
如果我们想按列名(而不是按数据类型)来选择要变换的列,可以使用 mutate_at,比如我们想把列名中包含 sleep 的列都乘以 60,即用分钟来表示时长;
msleep %>%
+ mutate_at(vars(contains('sleep')),~(.*60)) %>% glimpse()
Observations: 83
Variables: 11
$ name <chr> "Cheetah", "Owl monkey", "Mountain beaver", "Greater shor…
$ genus <chr> "Acinonyx", "Aotus", "Aplodontia", "Blarina", "Bos", "Bra…
$ vore <chr> "carni", "omni", "herbi", "omni", "herbi", "herbi", "carn…
$ order <chr> "Carnivora", "Primates", "Rodentia", "Soricomorpha", "Art…
$ conservation <chr> "lc", NA, "nt", "lc", "domesticated", NA, "vu", NA, "dome…
$ sleep_total <dbl> 726, 1020, 864, 894, 240, 864, 522, 420, 606, 180, 318, 5…
$ sleep_rem <dbl> NA, 108, 144, 138, 42, 132, 84, NA, 174, NA, 36, 48, 42, …
$ sleep_cycle <dbl> NA, NA, NA, 8, 40, 46, 23, NA, 20, NA, NA, 13, NA, 7, NA,…
$ awake <dbl> 11.9, 7.0, 9.6, 9.1, 20.0, 9.6, 15.3, 17.0, 13.9, 21.0, 1…
$ brainwt <dbl> NA, 0.01550, NA, 0.00029, 0.42300, NA, NA, NA, 0.07000, 0…
$ bodywt <dbl> 50.000, 0.480, 1.350, 0.019, 600.000, 3.850, 20.490, 0.04…
经过上面的例子我们可以看到 mutate 是非常强大的,mutate_* 系列函数还有很多高级用法,后面我们用到的时候还会接着讨论,现在拿起 mutate 把你的数据重新打造一遍吧!