我根据校历中各个假期的开始和结束日期整理了以下数据集(Break_data):
print(Break_data)
Start End Break Year
1 2016-02-24 2016-02-29 Spring_Break 2016
2 2016-03-23 2016-03-28 Easter_Recess 2016
3 2016-10-05 2016-10-10 Mid_Term_Break 2016
4 2017-03-01 2017-03-06 Spring_Break 2017
5 2017-04-12 2017-04-17 Easter_Recess 2017
6 2017-10-04 2017-10-09 Mid_Term_Break 2017
7 2018-02-28 2018-03-05 Spring_Break 2018
8 2018-03-28 2018-04-02 Easter_Recess 2018
另外还有一个非常大的数据集 df:
head(df$date)
[1] "2016-02-05" "2016-02-05" "2016-02-05" "2016-02-05" "2016-02-05" "2016-02-05"
tail(df$date)
[1] "2018-07-12" "2018-07-12" "2018-07-12" "2018-07-12" "2018-07-12" "2018-07-12"
我参考了以下回答中给出的步骤:https://stackoverflow.com/a/51052626/9341589
我想通过与数据集 df 中的日期进行比较,创建一个类似的因子变量 Break。df 除了 date 列(从 2016-02-05 到 2018-07-12)之外还包含许多其他变量,采样间隔为 15 分钟(即每天 96 行)。
在我的例子中,除了表中列出的这些假期之外,我还希望把不在任何 Start 到 End 范围内的日期标记为 Non_Break(非假期)。
在上面提到的链接中的步骤之后,这是R中代码的修改版本:
# Parse the Start/End columns and df$date with ymd().
Break_data$Start <- ymd(Break_data$Start)
Break_data$End <- ymd(Break_data$End)
df$date <- ymd(df$date)
# Expand each Start..End pair with `:` into one sequence per row of Break_data.
LU <- Map(`:`, Break_data$Start, Break_data$End)
# Flatten into a lookup table: `value` holds every expanded day, `index` holds
# the position of the Break_data row that the day came from.
LU <- data.frame(value = unlist(LU),
index = rep(seq_along(LU), lapply(LU, length)))
# Look up each df$date in the table and copy the matching break name; dates
# that match no range are left as NA.
df$Break <- Break_data$Break[LU$index[match(df$date, LU$value)]]
我猜除此之外,还需要用 for 循环或简单的 if 语句,为不在任何开始—结束范围内的日期赋值 Non_Break。
编辑:我用两种不同的方式尝试了:
第一种 - 不使用映射:
# First attempt: visit every row of df and label its date by comparing it
# against hard-coded break ranges; rows matching no range get "Not_Break".
# NOTE(review): the upper-bound test in every condition uses the whole column
# `df$date` rather than `df$date[i]`, so each condition has one element per
# row of df instead of a single TRUE/FALSE.
# NOTE(review): the 2018-02-28..2018-03-05 branch assigns "Easter_Recess",
# while Break_data lists that range as Spring_Break.
for (i in c(1:nrow(df))){
if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29")
df$Break[i]<-"Spring_Break"
else if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28")
df$Break[i]<-"Easter_Recess"
else if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10")
df$Break[i]<-"Mid_Term_Break"
else if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06")
df$Break[i]<-"Spring_Break"
else if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17")
df$Break[i]<-"Easter_Recess"
else if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09")
df$Break[i]<-"Mid_Term_Break"
else if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05")
df$Break[i]<-"Easter_Recess"
else if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02")
df$Break[i]<-"Easter_Recess"
else (df$Break[i]<-"Not_Break")
}
第一种方法运行起来没完没了 :),而且最终只得到两个取值:Not_Break 和 Spring_Break。
下面是警告消息:
Warning messages:
1: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
2: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
3: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
4: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
5: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
6: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
7: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
8: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
the condition has length > 1 and only the first element will be used
9: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
10: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
11: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
12: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
13: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
14: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
15: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
16: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
the condition has length > 1 and only the first element will be used
17: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
18: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
19: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
20: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
21: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
22: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
23: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
24: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
the condition has length > 1 and only the first element will be used
25: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
26: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
27: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
28: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
29: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
30: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
31: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
32: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
the condition has length > 1 and only the first element will be used
33: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
34: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
35: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
36: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
37: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
38: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
39: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
40: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
the condition has length > 1 and only the first element will be used
41: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
42: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
43: In if (df$date[i] >= "2016-10-05" & df$date <= "2016-10-10") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
44: In if (df$date[i] >= "2017-03-01" & df$date <= "2017-03-06") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
45: In if (df$date[i] >= "2017-04-12" & df$date <= "2017-04-17") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
46: In if (df$date[i] >= "2017-10-04" & df$date <= "2017-10-09") df$Break[i] <- "Mid_Term_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
47: In if (df$date[i] >= "2018-02-28" & df$date <= "2018-03-05") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
48: In if (df$date[i] >= "2018-03-28" & df$date <= "2018-04-02") df$Break[i] <- "Easter_Recess" else (df$Break[i] <- "Not_Break") :
the condition has length > 1 and only the first element will be used
49: In if (df$date[i] >= "2016-02-24" & df$date <= "2016-02-29") df$Break[i] <- "Spring_Break" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
50: In if (df$date[i] >= "2016-03-23" & df$date <= "2016-03-28") df$Break[i] <- "Easter_Recess" else if (df$date[i] >= ... :
the condition has length > 1 and only the first element will be used
第二种 - 在链接中的代码基础上添加:
# Second attempt: rebuild the day lookup table, then loop over the rows of df.
LU <- Map(`:`, Break_data$Start, Break_data$End)
LU <- data.frame(value = unlist(LU),
index = rep(seq_along(LU), lapply(LU, length)))
# NOTE(review): the `if` below has no statement between its condition and
# `else`, and its condition assigns the whole mapped column to df$Break
# instead of testing a single row.
# NOTE(review): the `else` branch only evaluates a comparison, so the
# "Not_Break" assignment on the following line is not part of the if/else.
for (i in c(1:nrow(df))){
if (df$Break <- Break_data$Break[LU$index[match(df$date, LU$value)]])
else (df$date[i] >= "2016-02-05" & df$date <= "2018-07-12")
df$Break[i]<-"Not_Break"
}
在第二种方法中,我同样遇到了错误。欢迎对代码或实现方式(使用 R 或 Python)提出任何修改建议。
有没有更有效的方法来做到这一点?
注意:数据集可在以下网址公开获得:https://github.com/tomiscat/data
发布于 2018-08-17 10:44:46
library(lubridate)

# Example data ----
# Break calendar: one row per break with its first and last day.
Break_data <- data.table::fread(
" Start End Break Year
2016-02-24 2016-02-29 Spring_Break 2016
2016-03-23 2016-03-28 Easter_Recess 2016
2016-10-05 2016-10-10 Mid_Term_Break 2016
2017-03-01 2017-03-06 Spring_Break 2017
2017-04-12 2017-04-17 Easter_Recess 2017
2017-10-04 2017-10-09 Mid_Term_Break 2017
2018-02-28 2018-03-05 Spring_Break 2018
2018-03-28 2018-04-02 Easter_Recess 2018"
)
# Small stand-in for the real data: repeated dates, three of them in a break.
df <- data.frame(
  date = c(
    "2016-02-05", "2016-02-05", "2016-02-05",
    "2016-02-05", "2016-02-05", "2016-02-05",
    "2016-02-26", "2016-10-07", "2018-03-30",
    "2018-07-12", "2018-07-12", "2018-07-12",
    "2018-07-12", "2018-07-12", "2018-07-12"
  )
)

# Mapping ----
Break_data$Start <- ymd(Break_data$Start)
Break_data$End <- ymd(Break_data$End)
df$date <- ymd(df$date)

# Expand every break into its individual days. `:` and unlist() drop the Date
# class, so `value` ends up holding plain day numbers.
LU <- Map(`:`, Break_data$Start, Break_data$End)
LU <- data.frame(
  value = unlist(LU),
  index = rep(seq_along(LU), lengths(LU))
)

# Compare day numbers on both sides: matching a Date against a plain numeric
# vector is not reliable across R versions, so strip the class explicitly.
df$Break <- Break_data$Break[LU$index[match(as.numeric(df$date), LU$value)]]

# Dates that fall in no break were not matched (NA): label them "Non_Break".
df$Break[is.na(df$Break)] <- "Non_Break"
df$Break <- factor(df$Break)
df
#> date Break
#> 1 2016-02-05 Non_Break
#> 2 2016-02-05 Non_Break
#> 3 2016-02-05 Non_Break
#> 4 2016-02-05 Non_Break
#> 5 2016-02-05 Non_Break
#> 6 2016-02-05 Non_Break
#> 7 2016-02-26 Spring_Break
#> 8 2016-10-07 Mid_Term_Break
#> 9 2018-03-30 Easter_Recess
#> 10 2018-07-12 Non_Break
#> 11 2018-07-12 Non_Break
#> 12 2018-07-12 Non_Break
#> 13 2018-07-12 Non_Break
#> 14 2018-07-12 Non_Break
#> 15 2018-07-12 Non_Break
由reprex package创建于2018-08-19 (v0.2.0)。
编辑:完整解决方案
https://stackoverflow.com/questions/51887163
复制相似问题