我有一家医院的数据,有很多变量,还有每一行的from和to data,这告诉我们每一行什么时候是“有效的”。每一行的有效期最长为一年。
test = data.frame(ID=c(10,10,10,12,12), Disease=c("P","P","P","D","P"), Pass=c("US","US","US","EN","EN"),
Payment=c(110,110,115,240,255),
from_date=as.POSIXct(c("2008-01-09","2009-01-09","2010-01-09","2008-01-01","2013-12-31")),
to_date=as.POSIXct(c("2009-01-08","2010-01-08","2011-01-08","2008-12-31","2014-12-30"))
)
对于从一年到另一年的行,我希望拆分这些行,这样我就可以得到两行而不是原来的行,还可以操作from_date和to_date,这样我就可以得到一个新的数据集,如下所示:
test_desired = data.frame(ID=c(10,10,10,10,10,10,12,12,12), Disease=c("P","P","P","P","P","P","D","P","P"), Pass=c("US","US","US","US","US","US","EN","EN","EN"),
Payment=c(110,110,110,110,115,115,240,255,255),
from_date=as.POSIXct(c("2008-01-09","2009-01-01","2009-01-09","2009-01-01","2010-01-09","2011-01-01","2008-01-01","2013-12-31","2014-01-01")),
to_date=as.POSIXct(c("2008-12-31","2009-01-08","2009-12-31","2010-01-08","2010-12-31","2011-01-08","2008-12-31","2013-12-31","2014-12-30"))
)
尝试
library(lubridate) #for function "year" below
test_desired=test
row=c()
tmp=c()
for(i in 1:nrow(test_desired)){
if(year(test_desired$from_date)[i]<year(test_desired$to_date)[i]){
test_desired$to_date[i] = as.POSIXct(paste0(year(test_desired$from_date[i]),"-12-31"))
row = test_desired[i,]
row$from_date = as.POSIXct(paste0(year(test$to_date[i]),"-01-01"))
row$to_date = test$to_date[i]
tmp=rbind(tmp,row)
} else next
}
test_desired=rbind(test_desired,tmp)
library(dplyr)
test_desired=arrange(test_desired,ID,from_date)
有没有一种更优雅的方式来做这件事,比如使用dplyr?
发布于 2019-05-19 21:13:23
这是一个基于tidyverse的解决方案。它类似于Lennyy,但条件检查更少,添加时间也没有问题(它们可能会显示在tibble中,但会显示为00:00:00
)。我添加了ungroup()
,因为它听起来像是在某个地方有一个分组变量(在Lennyy的解决方案下进行注释)。如果您不这样做,则可以将其删除:
library(dplyr)
library(lubridate)
library(purrr)
test %>%
ungroup() %>% # This isn't necessary if there are no groupings.
split(rownames(test)) %>%
map_dfr(function(df){
if (year(df$from_date) == year(df$to_date)) return(df)
bind_rows(mutate(df, to_date = rollback(floor_date(to_date, "y"))),
mutate(df, from_date = floor_date(to_date, "y"))
)
}
)
#### OUTPUT ####
ID Disease Pass Payment from_date to_date
1 10 P US 110 2008-01-09 2008-12-31
2 10 P US 110 2009-01-01 2009-01-08
3 10 P US 110 2009-01-09 2009-12-31
4 10 P US 110 2010-01-01 2010-01-08
5 10 P US 115 2010-01-09 2010-12-31
6 10 P US 115 2011-01-01 2011-01-08
7 12 D EN 240 2008-01-01 2008-12-31
8 12 P EN 255 2013-12-31 2013-12-31
9 12 P EN 255 2014-01-01 2014-12-30
解释:数据帧被分成一个行列表。然后,我使用map_dfr
在from_date
和to_date
包含不同年份的每个数据帧上运行函数。map_dfr
还将结果数据帧绑定在一起。在匿名函数中,我按年填写to_date
,然后针对第一行中的新to_date
将其回滚到上个月的最后一天,或者将其保留为第二行中新from_date
的原样。
发布于 2019-05-19 18:49:58
使用from_date和to_date,我们可以使用seq.Date
创建日期序列,然后将该序列按年拆分,最后选择每年的最小和最大值。然后使用apply
、separate_rows
和separate
得到最终结果。
cr_date <- function(d1, d2){
#browser()
sequence_date <- seq.Date(as.Date(d1), as.Date(d2), by='day')
lst_dates <- lapply(split(sequence_date, lubridate::year(sequence_date)),
function(x) paste0(min(x), '|', max(x)))
result <- paste0(lst_dates, collapse = ';')
return(result)
}
#Test
#cr_date(as.Date('2008-01-09'),as.Date('2009-01-08'))
test$flag <- apply(test, 1, function(x) cr_date(x['from_date'], x['to_date']))
library(tidyr)
separate_rows(test, flag, sep=';') %>%
separate(flag, into = c('from_date_new','to_date_new'), '\\|') %>%
mutate_at(vars('from_date_new','to_date_new'), list(~as.Date(.)))
ID Disease Pass Payment from_date to_date from_date_new to_date_new
1 10 P US 110 2008-01-09 2009-01-08 2008-01-09 2008-12-31
2 10 P US 110 2008-01-09 2009-01-08 2009-01-01 2009-01-08
3 10 P US 110 2009-01-09 2010-01-08 2009-01-09 2009-12-31
4 10 P US 110 2009-01-09 2010-01-08 2010-01-01 2010-01-08
5 10 P US 115 2010-01-09 2011-01-08 2010-01-09 2010-12-31
6 10 P US 115 2010-01-09 2011-01-08 2011-01-01 2011-01-08
7 12 D EN 240 2008-01-01 2008-12-31 2008-01-01 2008-12-31
8 12 P EN 255 2013-12-31 2014-12-30 2013-12-31 2013-12-31
9 12 P EN 255 2013-12-31 2014-12-30 2014-01-01 2014-12-30
发布于 2019-05-20 01:09:38
这只使用了基数R。
首先请注意,只使用不带时间的日期,所以我们应该使用Date
类,而不是POSIXct
。后者可能会引入不必要的时区错误,除非您非常小心,因此在末尾的说明中,我们假设我们从包含Date
类数据的test2
开始。注释中的代码还显示了如何将其转换为Date
类(如果它已经POSIXct
)。
在给定test2
的情况下,我们添加from_year
、to_year
和eoy
(年底日期)列,以提供test3
。然后我们迭代各行,如果年份相同,则返回行,如果不相同,则返回拆分的行。这给出了我们rbind
在一起的单行和双行数据帧的列表。
test3 <- transform(test2,
from_year = format(from_date, "%Y"),
to_year = format(to_date, "%Y"),
eoy = as.Date(sub("-.*", "-12-31", from_date)))
nr <- nrow(test2)
do.call("rbind", lapply(1:nr, function(i) with(test3[i, ],
if (from_year == to_year) test2[i, ]
else data.frame(ID, Disease, Pass, Payment,
from_date = c(from_date, eoy+1),
to_date = c(eoy, to_date)))
))
备注
假设输入为可重现的形式。如上所述,它使用Date
类。
test2 <- transform(test,
from_date = as.Date(from_date),
to_date = as.Date(to_date))
https://stackoverflow.com/questions/56206794
复制相似问题