文章/答案/技术大牛

发布

社区首页 >问答首页 >来自长格式数据的几个因素的计数和百分比

问来自长格式数据的几个因素的计数和百分比
EN

Stack Overflow用户

提问于 2021-06-28 15:02:41

回答 1查看 67关注 0票数 0

在当前一个涉及重复测量的项目中，我第一次使用长格式（long format）的数据集。

我正在尝试获取几个分类变量的每个时间点的描述性统计数据(计数、百分比)。

我的数据：

library(dplyr)

# Sample of the long-format data: two rows (time1, time2) for each of ten ids.
# Factor columns are stored as integer codes plus a `.Label` levels attribute.
questiondata <- structure(
  list(
    id = c(
      2, 2, 6, 6, 9, 9, 22, 22, 23, 23,
      25, 25, 30, 30, 31, 31, 33, 33, 34, 34
    ),
    time = structure(
      c(
        1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
        1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L
      ),
      .Label = c("time1", "time2"),
      class = "factor"
    ),
    age = c(
      65, 69.17, 76.75, 81.05, 58.64, 62.71, 59.37, 63.56, 58, 61.69,
      55.78, 59.95, 59.3, 63.36, 60.45, 64.39, 56.3, 60.08, 59.53, 63.84
    ),
    sex = structure(
      c(
        1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
        1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L
      ),
      .Label = c("men", "women"),
      class = "factor"
    ),
    hypert_drug = structure(
      c(
        1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
        1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L
      ),
      .Label = c("no", "yes"),
      class = "factor"
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)

它对应于下面的tibble：

# A tibble: 20 x 5
      id time    age sex   hypert_drug
   <dbl> <fct> <dbl> <fct> <fct>      
 1     2 time1  65   men   no         
 2     2 time2  69.2 men   yes        
 3     6 time1  76.8 women yes        
 4     6 time2  81.0 women yes        
 5     9 time1  58.6 men   no         
 6     9 time2  62.7 men   no         
 7    22 time1  59.4 men   no         
 8    22 time2  63.6 men   no         
 9    23 time1  58   women no         
10    23 time2  61.7 women no         
11    25 time1  55.8 men   no         
12    25 time2  60.0 men   no         
13    30 time1  59.3 women no         
14    30 time2  63.4 women yes        
15    31 time1  60.4 men   yes        
16    31 time2  64.4 men   yes        
17    33 time1  56.3 men   no         
18    33 time2  60.1 men   no         
19    34 time1  59.5 women no         
20    34 time2  63.8 women no

要获得每个时间点各性别的人数：

# Number of rows for each combination of time point and sex.
summarise(
  group_by(long_dataset, time, sex),
  n_sex = n()
)

这将产生以下输出（基于完整数据集，而非上面的 20 行示例）：

`summarise()` has grouped output by 'time'. You can override using the `.groups` argument.
# A tibble: 10 x 3
# Groups:   time [5]
   time  sex   n_sex
   <fct> <fct> <int>
 1 time1 men     398
 2 time1 women   371
 3 time2 men     398
 4 time2 women   371
 5 time3 men     398
 6 time3 women   371
 7 time4 men     804
 8 time4 women   917
 9 time5 men    1202
10 time5 women  1288

我还想再得到一列，表示每个时间点男性和女性各自所占的比例；并且为变量 `hypert_drug` 得到类似的列，给出它在每个时间点的计数和百分比。

有什么想法吗？谢谢!

categorical-data

dplyr

count

回答 1

Stack Overflow用户

发布于 2021-06-28 15:27:30

以您示例中的 long_dataset 为例，只需扩展您的 dplyr 链即可。

library(dplyr)
# Example data in long format: every id contributes one row per time point.
long_dataset <- structure(
  list(
    # Each id appears on two consecutive rows (time1, then time2).
    id = rep(c(2, 6, 9, 22, 23, 25, 30, 31, 33, 34), each = 2),
    time = structure(
      rep(c(1L, 2L), times = 10L),
      .Label = c("time1", "time2"),
      class = "factor"
    ),
    age = c(
      65, 69.17, 76.75, 81.05, 58.64, 62.71, 59.37, 63.56, 58, 61.69,
      55.78, 59.95, 59.3, 63.36, 60.45, 64.39, 56.3, 60.08, 59.53, 63.84
    ),
    # The sex code is the same on both rows of an id, so one code per id
    # is repeated pairwise.
    sex = structure(
      rep(c(1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L), each = 2),
      .Label = c("men", "women"),
      class = "factor"
    ),
    # Drug status can change between time points, so it is spelled out in full.
    hypert_drug = structure(
      c(
        1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
        1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L
      ),
      .Label = c("no", "yes"),
      class = "factor"
    )
  ),
  row.names = c(NA, -20L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Count rows for each time point / sex / drug-status combination.
# summarise() removes the last grouping variable, so the result is still
# grouped by time and sex; sum(count) in mutate() is therefore the total
# within each time/sex pair and count_freq is the share inside that pair.
long_dataset %>%
  dplyr::group_by(time, sex, hypert_drug) %>%
  dplyr::summarise(count = dplyr::n()) %>%
  dplyr::mutate(count_freq = count / sum(count))

#> # A tibble: 8 x 5
#> # Groups:   time, sex [4]
#>   time  sex   hypert_drug count count_freq
#>   <fct> <fct> <fct>       <int>      <dbl>
#> 1 time1 men   no              5      0.833
#> 2 time1 men   yes             1      0.167
#> 3 time1 women no              3      0.75 
#> 4 time1 women yes             1      0.25 
#> 5 time2 men   no              4      0.667
#> 6 time2 men   yes             2      0.333
#> 7 time2 women no              2      0.5  
#> 8 time2 women yes             2      0.5
Created on 2021-06-28 by the reprex package (v0.3.0)

更新

我不确定如何在单个 dplyr 链中做到这一点，下面分三步完成（两条 dplyr 链，再按列合并一次）。也许其他人有更好的做法。希望我正确理解了您想要的输出。

library(dplyr)
# Example data in long format (20 rows: ten ids at time1 and time2).
# Built inside local() so the helper below does not leak into the workspace.
long_dataset <- local({
  # Attach levels and the factor class to a vector of integer codes.
  as_coded_factor <- function(codes, labels) {
    structure(codes, .Label = labels, class = "factor")
  }

  structure(
    list(
      id = c(
        2, 2, 6, 6, 9, 9, 22, 22, 23, 23,
        25, 25, 30, 30, 31, 31, 33, 33, 34, 34
      ),
      time = as_coded_factor(
        c(
          1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
          1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L
        ),
        c("time1", "time2")
      ),
      age = c(
        65, 69.17, 76.75, 81.05, 58.64, 62.71, 59.37, 63.56, 58, 61.69,
        55.78, 59.95, 59.3, 63.36, 60.45, 64.39, 56.3, 60.08, 59.53, 63.84
      ),
      sex = as_coded_factor(
        c(
          1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L,
          1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L
        ),
        c("men", "women")
      ),
      hypert_drug = as_coded_factor(
        c(
          1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L,
          1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L
        ),
        c("no", "yes")
      )
    ),
    row.names = c(NA, -20L),
    class = c("tbl_df", "tbl", "data.frame")
  )
})

# Per time point: number of rows for each sex (n_sex) and that sex's share of
# the time point (freq_sex). After summarise() the data are still grouped by
# time, so sum(n_sex) is the total for the time point; ungroup() then drops
# the remaining grouping.
sex <- long_dataset %>%
  dplyr::group_by(time, sex) %>%
  dplyr::summarise(n_sex = dplyr::n()) %>%
  dplyr::mutate(freq_sex = n_sex / sum(n_sex)) %>%
  dplyr::ungroup()


# Per time point: number of rows for each drug status (n_drug) and its share
# of the time point (freq_drug), built one step at a time.
drug <- dplyr::group_by(long_dataset, time, hypert_drug)
drug <- dplyr::summarise(drug, n_drug = dplyr::n())
# Still grouped by time here, so sum(n_drug) is the total for the time point.
drug <- dplyr::mutate(drug, freq_drug = n_drug / sum(n_drug))
# Drop `time` so it is not duplicated when the two summaries are put together.
drug <- dplyr::select(dplyr::ungroup(drug), -time)

# bind_cols() pairs rows by position, not by key: this relies on `sex` and
# `drug` having the same number of rows in the same time order.
dplyr::bind_cols(sex, drug)
#> # A tibble: 4 x 7
#>   time  sex   n_sex freq_sex hypert_drug n_drug freq_drug
#>   <fct> <fct> <int>    <dbl> <fct>        <int>     <dbl>
#> 1 time1 men       6      0.6 no               8       0.8
#> 2 time1 women     4      0.4 yes              2       0.2
#> 3 time2 men       6      0.6 no               6       0.6
#> 4 time2 women     4      0.4 yes              4       0.4
Created on 2021-06-29 by the reprex package (v0.3.0)

票数 0

页面原文内容由Stack Overflow提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://stackoverflow.com/questions/68158917

复制

相似问题

问来自长格式数据的几个因素的计数和百分比
EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问来自长格式数据的几个因素的计数和百分比EN

回答 1

Stack Overflow用户

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问来自长格式数据的几个因素的计数和百分比
EN