R语言包_plyr

plyr: The split-apply-combine strategy for R

plyr 用向量化操作取代显式循环,目的是简化 apply 族函数的使用;它相当于把 split、apply、combine 三个步骤整合到一起。

基础

R函数和plyr

# Load plyr up front: it provides ddply() and the `baseball` data set that
# the code below relies on.
library(plyr)

# --- Base R: split-apply-combine done by hand ---

# Split: one data frame (columns 6 to 9 only) per distinct value of year.
pieces <- split(baseball[, 6:9], baseball$year)

# Apply: store the column means of every piece in a preallocated list.
# seq_along() keeps the loop from running when `pieces` is empty.
results <- vector("list", length(pieces))
for (i in seq_along(pieces)) {
  results[[i]] <- colMeans(pieces[[i]])
}

# Combine: stack the per-piece mean vectors into one data frame and record
# which piece each row came from.
result <- do.call("rbind", results)
result <- as.data.frame(result)
result$name <- names(pieces)

# --- plyr: the same computation in a single call ---
result2 <- ddply(baseball, "year", function(df) colMeans(df[, 6:9]))

# Compare the two results.
head(result2)
head(result)

plyr包中一些有用的函数

# each: bundle several functions together so their results are output in one
# call. The lines below show the different ways of passing the functions
# (bare names, strings, a character vector, a vector of functions).
each(min, max)(1:10)
each(length, mean, var)(rnorm(100))
each("min", "max")(1:10)
each(c("min", "max"))(1:10)
each(c(min, max))(1:10)
# colwise: turn a function that only works on a vector into one that works on
# the columns of a data frame. nmissing counts the NA values in a vector.
nmissing <- function(x) sum(is.na(x))
colwise(nmissing)(baseball)
ddply(baseball, .(year), colwise(nmissing))
ddply(baseball, .(year), colwise(nmissing, c("sb", "cs", "so")))
ddply(baseball, .(year), colwise(nmissing, ~ sb + cs + so))
ddply(baseball, .(year), colwise(nmissing, is.character))
ddply(baseball, .(year), colwise(nmissing, is.numeric))
ddply(baseball, .(year), colwise(nmissing, is.discrete))
ddply(baseball, .(year), numcolwise(nmissing))
ddply(baseball, .(year), catcolwise(nmissing))
numcolwise(mean)(baseball, na.rm = TRUE)
numcolwise(mean, na.rm = TRUE)(baseball)
# arrange: sort a data frame quickly without the tedious order() step.
# The first line is the base R equivalent, shown for comparison.
mtcars[with(mtcars, order(cyl, disp)), ]
arrange(mtcars, cyl, disp)
myCars = cbind(vehicle=row.names(mtcars), mtcars)
arrange(myCars, cyl, disp)
arrange(myCars, cyl, desc(disp))
# rename: rename by variable name rather than by variable position.
x <- c("a" = 1, "b" = 2, d = 3, 4)
x <- rename(x, replace = c("d" = "c"))
rename(mtcars, c("disp" = "displacement"))
# count: equivalent to as.data.frame(table(x)).
count(baseball[1:100,], vars = "id")
count(baseball[1:100,], vars = "id", wt_var = "g")
count(baseball[1:100,], c("id", "year"))
# match_df: used together with count to pick out the rows that meet a
# condition (here: rows whose id occurs more than 25 times in baseball).
longterm <- subset(count(baseball, "id"), freq > 25)
bb_longterm <- match_df(baseball, longterm, on="id")
# join: similar to a SQL join, and faster than merge. The two system.time()
# calls time merge() and join() on the same inputs for comparison.
first <- ddply(baseball, "id", summarise, first = min(year))
system.time(b2 <- merge(baseball, first, by = "id", all.x = TRUE))
system.time(b3 <- join(baseball, first, by = "id"))

R程序

# A small reproducible data set: three years with three counts each.
set.seed(1)
d <- data.frame(
  year = rep(2000:2002, each = 3),
  count = round(runif(9, min = 0, max = 20))
)
d

# Coefficient of variation (sd / mean) of count within each year, returned
# as a one-column data frame per group.
ddply(d, "year", function(group) {
  data.frame(cv.count = sd(group$count) / mean(group$count))
})

# The same kind of per-year computation using summarise, transform and
# mutate (mutate works like transform).
ddply(
  d, "year", summarise,
  mu = mean(count), sigma = sd(count), cv = sigma / mu
)
ddply(
  d, "year", transform,
  mu = mean(count), sigma = sd(count)
)
ddply(
  d, "year", mutate,
  mu = mean(count), sigma = sd(count), cv = sigma / mu
)

# Build separate models: fit a linear regression of hwy on year for the data
# frame that is passed in.
model <- function(df) lm(hwy ~ year, data = df)
# `mpg` is a data set shipped with ggplot2, which this script only attaches
# further down; reference it through its namespace so this call does not fail
# with "object 'mpg' not found".
# One model per value of cyl, returned as a list.
models <- dlply(ggplot2::mpg, .(cyl), model)
# Collect the coefficients of every model into one data frame.
coefs <- ldply(models, coef)

# Plot: one base-graphics panel per year, then the same data with ggplot2.

# Save only the settable graphical parameters. A bare par() also returns
# read-only ones, which raise warnings when they are passed back to par().
opar <- par(no.readonly = TRUE)
par(mfrow = c(1, 3), mar = c(2, 2, 1, 1), oma = c(3, 3, 0, 0))
d_ply(d, "year", summarise, plot(count, main = unique(year), type = "o"))
mtext("count", side = 1, outer = TRUE, line = 1)
mtext("frequency", side = 2, outer = TRUE, line = 1)
# Restore the saved settings only after the panel figure is complete, so the
# layout change does not leak into later plots.
par(opar)

library(ggplot2)
ggplot(d, aes(x = year, y = count)) + geom_line() + facet_grid(year ~ .)

# Nested chunking of the data: group by two variables (year and team) at once
# and sum hr within every group.
baseball.dat <- subset(baseball, year > 2000)
head(baseball.dat)
x <- ddply(
  baseball.dat,
  c("year", "team"),
  summarize,
  homeruns = sum(hr)
)
head(x)

# Dealing with errors: a function that fails for the input 1 and returns 1
# for anything else.
f <- function(x) {
  if (x == 1) {
    stop("error!")
  }
  1
}
# Wrap f so that a failure yields NA instead of raising an error.
safe.f <- failwith(NA, f, quiet = TRUE)
# The unwrapped call fails on the first element. try() still shows the error
# message but lets a non-interactive run continue to the safe version below.
try(llply(1:2, f))
llply(1:2, safe.f)

# Parallel processing: time a task that sleeps 0.1 s per element, first
# sequentially and then with plyr's .parallel option.
x <- 1:10
wait <- function(i) Sys.sleep(0.1)
system.time(llply(x, wait))
system.time(sapply(x, wait))

# Install doMC only when it is missing, instead of reinstalling it on every
# run of the script.
if (!requireNamespace("doMC", quietly = TRUE)) {
  install.packages("doMC")
}
library(doMC)
# Register the doMC backend with 2 workers before asking llply to go parallel.
registerDoMC(2)
system.time(llply(x, wait, .parallel = TRUE))

# A plyr drawback: it is slower than the built-in functions. Both calls count
# the rows per id; compare the two timings.
system.time(
  ddply(baseball, "id", summarize, length(year))
)
system.time(
  tapply(baseball$year, baseball$id, length)
)

参考资料

Sean Anderson 的R教程

本文参与腾讯云自媒体分享计划,欢迎正在阅读的你也加入,一起分享。

发表于

我来说两句

0 条评论
登录 后参与评论

扫码关注云+社区

领取腾讯云代金券