你好,我正在寻找一种高效的方法,从 data.table 中选出 POSIXct 列里"一天中的时刻"早于 12:00:00 的那些行(注意:不需要毫秒精度,因此可以使用 ITime)。
# Reproducible sample data: N timestamps scattered around the Unix epoch.
set.seed(1)
N <- 1e7
# rnorm() draws are scaled by 1e5 and stamped as GMT date-times by .POSIXct().
DT <- data.table(dts = .POSIXct(1e5 * rnorm(N), tz = "GMT"))
DT
dts
# 1: 1969-12-31 06:35:54.618925
# 2: 1970-01-01 05:06:04.332422
# ---
# 9999999: 1970-01-03 00:37:00.035565
#10000000: 1969-12-30 08:30:23.624506
一种解决方案(这里的问题是,如果N很大,则强制转换可能会很昂贵)
# Flag the elements of `t` whose time of day lies within [st, et], inclusive.
# All three arguments are converted with as.ITime() before being compared.
f <- function(t, st, et) {
  time_of_day <- as.ITime(t)
  lower <- as.ITime(st)
  upper <- as.ITime(et)
  time_of_day >= lower & time_of_day <= upper
}
# xts-style time filter (geekTrader's solution).
# `s` is parsed with .parseISO8601(). When `s` contains a pure time-of-day
# window of the form 'Thh:mm:ss/Thh:mm:ss', each element of `t` is tested on
# its second-of-day (as.double(t) %% 86400); otherwise `t` is compared directly
# against the parsed first/last instants. Both bounds are inclusive.
P <- function(t, s) {
  bounds <- .parseISO8601(s)
  tod_pattern <- "T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}"
  if (!grepl(tod_pattern, s)) {
    return(t >= bounds$first.time & t <= bounds$last.time)
  }
  # 31449600 s = 364 days. NOTE(review): presumably .parseISO8601() places the
  # end of a time-only range 364 days after its start -- confirm against xts.
  # NOTE(review): the bounds are epoch seconds, so this looks like it expects
  # the session time zone to be GMT/UTC -- confirm.
  window_start <- as.double(bounds$first.time)
  window_end <- as.double(bounds$last.time) - 31449600
  second_of_day <- as.double(t) %% 86400
  second_of_day >= window_start & second_of_day <= window_end
}
快速查看性能
system.time(resf <- DT[f(dts,'00:00:00','11:59:59')])
user system elapsed
1.01 0.28 1.29
system.time(resP <- DT[P(dts,'T00:00:00/T11:59:59')])
user system elapsed
0.64 0.13 0.76
identical(resf,resP)
[1] TRUE
发布于 2013-04-05 20:07:29
# xts-style time filter.
# `s` is parsed with .parseISO8601(). When `s` contains a pure time-of-day
# window of the form 'Thh:mm:ss/Thh:mm:ss', each element of `t` is tested on
# its second-of-day (as.double(t) %% 86400); otherwise `t` is compared directly
# against the parsed first/last instants. Both bounds are inclusive.
P <- function(t, s) {
  bounds <- .parseISO8601(s)
  tod_pattern <- "T[0-9]{2}:[0-9]{2}:[0-9]{2}/T[0-9]{2}:[0-9]{2}:[0-9]{2}"
  if (!grepl(tod_pattern, s)) {
    return(t >= bounds$first.time & t <= bounds$last.time)
  }
  # 31449600 s = 364 days. NOTE(review): presumably .parseISO8601() places the
  # end of a time-only range 364 days after its start -- confirm against xts.
  # NOTE(review): the bounds are epoch seconds, so this looks like it expects
  # the session time zone to be GMT/UTC -- confirm.
  window_start <- as.double(bounds$first.time)
  window_end <- as.double(bounds$last.time) - 31449600
  second_of_day <- as.double(t) %% 86400
  second_of_day >= window_start & second_of_day <= window_end
}
# Inclusive time-of-day filter: TRUE where as.ITime(t) falls within [st, et].
# NOTE(review): the name `F` shadows R's built-in shorthand for FALSE; it is
# kept unchanged so existing calls keep working.
F <- function(t, st, et) {
  tod <- as.ITime(t)
  from <- as.ITime(st)
  to <- as.ITime(et)
  tod >= from & tod <= to
}
# Run the session in GMT, the same zone the sample timestamps are stamped with.
Sys.setenv(TZ = "GMT")
# Reproducible sample data: N timestamps scattered around the Unix epoch.
set.seed(1)
N <- 1e7
DT <- data.table(dts = .POSIXct(1e5 * rnorm(N), tz = "GMT"))
system.time(resP <- DT[P(dts, 'T00:00:00/T12:00:00'), ])
## user system elapsed
## 1.11 0.11 1.22
system.time(resF <- DT[F(dts,'00:00:00','12:00:00')])
## user system elapsed
## 2.22 0.29 2.51
resP
## dts
## 1: 1969-12-31 06:35:54
## 2: 1970-01-01 05:06:04
## 3: 1969-12-31 00:47:17
## 4: 1970-01-01 09:09:10
## 5: 1969-12-31 01:12:33
## ---
##5000672: 1970-01-01 06:08:15
##5000673: 1970-01-01 05:02:27
##5000674: 1969-12-31 02:25:24
##5000675: 1970-01-03 00:37:00
##5000676: 1969-12-30 08:30:23
resF
## dts
## 1: 1969-12-31 06:35:54
## 2: 1970-01-01 05:06:04
## 3: 1969-12-31 00:47:17
## 4: 1970-01-01 09:09:10
## 5: 1969-12-31 01:12:33
## ---
##5000672: 1970-01-01 06:08:15
##5000673: 1970-01-01 05:02:27
##5000674: 1969-12-31 02:25:24
##5000675: 1970-01-03 00:37:00
##5000676: 1969-12-30 08:30:23
#Check the correctness
resP[,list(mindts=max(dts)),by=list(as.Date(dts))]
## as.Date mindts
## 1: 1969-12-31 1969-12-31 12:00:00
## 2: 1970-01-01 1970-01-01 12:00:00
## 3: 1969-12-29 1969-12-29 12:00:00
## 4: 1970-01-02 1970-01-02 12:00:00
## 5: 1969-12-30 1969-12-30 12:00:00
## 6: 1970-01-03 1970-01-03 12:00:00
## 7: 1970-01-04 1970-01-04 11:59:59
## 8: 1970-01-05 1970-01-05 11:59:45
## 9: 1969-12-28 1969-12-28 12:00:00
##10: 1969-12-27 1969-12-27 11:59:21
##11: 1970-01-06 1970-01-06 10:53:21
##12: 1969-12-26 1969-12-26 10:15:03
##13: 1970-01-07 1970-01-07 08:21:55
resF[,list(mindts=max(dts)),by=list(as.Date(dts))]
## as.Date mindts
## 1: 1969-12-31 1969-12-31 12:00:00
## 2: 1970-01-01 1970-01-01 12:00:00
## 3: 1969-12-29 1969-12-29 12:00:00
## 4: 1970-01-02 1970-01-02 12:00:00
## 5: 1969-12-30 1969-12-30 12:00:00
## 6: 1970-01-03 1970-01-03 12:00:00
## 7: 1970-01-04 1970-01-04 11:59:59
## 8: 1970-01-05 1970-01-05 11:59:45
## 9: 1969-12-28 1969-12-28 12:00:00
##10: 1969-12-27 1969-12-27 11:59:21
##11: 1970-01-06 1970-01-06 10:53:21
##12: 1969-12-26 1969-12-26 10:15:03
##13: 1970-01-07 1970-01-07 08:21:55
下面演示一些不错的 xts 风格的子集选取(subsetting)用法
DT[P(dts, '1970')]
## dts
## 1: 1970-01-01 05:06:04
## 2: 1970-01-02 20:18:48
## 3: 1970-01-01 09:09:10
## 4: 1970-01-01 13:32:22
## 5: 1970-01-01 20:30:32
## ---
##5001741: 1970-01-02 15:51:12
##5001742: 1970-01-03 01:41:31
##5001743: 1970-01-01 06:08:15
##5001744: 1970-01-01 05:02:27
##5001745: 1970-01-03 00:37:00
DT[P(dts, '197001')]
## dts
## 1: 1970-01-01 05:06:04
## 2: 1970-01-02 20:18:48
## 3: 1970-01-01 09:09:10
## 4: 1970-01-01 13:32:22
## 5: 1970-01-01 20:30:32
## ---
##5001741: 1970-01-02 15:51:12
##5001742: 1970-01-03 01:41:31
##5001743: 1970-01-01 06:08:15
##5001744: 1970-01-01 05:02:27
##5001745: 1970-01-03 00:37:00
DT[P(dts, '19700102')]
## dts
## 1: 1970-01-02 20:18:48
## 2: 1970-01-02 17:59:38
## 3: 1970-01-02 07:14:53
## 4: 1970-01-02 02:13:03
## 5: 1970-01-02 01:31:37
## ---
##1519426: 1970-01-02 11:25:24
##1519427: 1970-01-02 10:00:21
##1519428: 1970-01-02 05:21:25
##1519429: 1970-01-02 05:11:26
##1519430: 1970-01-02 15:51:12
DT[P(dts, '19700102 00:00:00/19700103 12:00:00')]
## dts
## 1: 1970-01-02 20:18:48
## 2: 1970-01-02 17:59:38
## 3: 1970-01-02 07:14:53
## 4: 1970-01-02 02:13:03
## 5: 1970-01-02 01:31:37
## ---
##1785762: 1970-01-02 05:21:25
##1785763: 1970-01-02 05:11:26
##1785764: 1970-01-02 15:51:12
##1785765: 1970-01-03 01:41:31
##1785766: 1970-01-03 00:37:00
#Check the correctness again
DT[P(dts, '19700102 00:00:00/19700103 12:00:00'), max(dts)]
##[1] "1970-01-03 12:00:00 GMT"
DT[P(dts, '19700102 00:00:00/19700103 12:00:00'), min(dts)]
##[1] "1970-01-02 00:00:00 GMT"
发布于 2013-04-11 16:04:24
执行此操作的规范方法是将其转换为 POSIXlt,然后提取其中的 hour(小时)分量。
# Logical mask: TRUE where the GMT hour of `dts` is before noon.
hour(as.POSIXlt(DT[["dts"]], tz = "GMT")) < 12
这在性能上似乎可以与所讨论的其他技术相媲美(并且更容易理解)。
发布于 2013-04-16 10:27:41
这是一个较晚的回答,但我认为 as.POSIXlt 方案会创建一个由多个命名向量组成的列表,而您实际上只需要其中的小时(hour)分量。
我会把 ITime 列设为键(key),然后使用二分查找选出中午12点之前的时间。
从 00:00:01 到 11:59:59 共有 60*60*12 - 1 = 43199 秒,因此 seq_len(43199) 对应中午12点之前(不含12点)的这些秒值;注意 seq_len 从 1 开始,不包含 00:00:00(对应的 ITime 值为 0)。
# Add IDate and ITime columns by reference and key the table on time of day.
setkey(DT[, c("Date", "Time") := IDateTime(dts)], Time)
# Subset times before 12pm with a keyed (binary-search) join on `Time`.
# Seconds-of-day before noon run from 0 (00:00:00) to 43199 (11:59:59);
# seq_len(43199) starts at 1 and would silently drop the midnight rows.
# nomatch = 0L drops seconds that have no row instead of returning NA rows.
DT[.(0:43199), nomatch = 0L]
https://stackoverflow.com/questions/15830341
复制相似问题