# R数据分析大数据当中的化整为零（Split-Apply-Combine）策略

1. 数据需要分组处理
2. 数据需要按照每行或者每列来处理
3. 数据需要分级处理，和分组很类似，但是分级时需要考虑分级之间的关系。

```r
> library(plyr) # need for dataset ozone
> library(MASS) # need for function rlm
> month <- ordered(rep(1:12, length=72)) #set time sequence
> #try one set
> one <- ozone[1,1,]
> model <- rlm(one ~ month - 1); model
Call:
rlm(formula = one ~ month - 1)
Converged in 9 iterations

Coefficients:
  month1   month2   month3   month4   month5   month6   month7   month8   month9  month10  month11  month12 
264.3964 259.2036 255.0000 252.0052 258.5089 265.3387 274.0000 276.6724 277.0000 285.0000 283.6036 273.1964 

Degrees of freedom: 72 total; 60 residual
Scale estimate: 4.45 
> deseas <- resid(model)
```

```r
> deseasf <- function(value) rlm(value ~ month -1) #the function
> models <- as.list(rep(NA, 24*24)) #prepare the variable
> dim(models) <- c(24, 24)
> deseas <- array(NA, c(24,24,72)) #prepare the variable
> dimnames(deseas) <- dimnames(ozone)
> for (i in seq_len(24)) { #for loop for first dimension
+   for(j in seq_len(24)) { #for loop for second dimension
+     mod <- deseasf(ozone[i, j, ]) #apply function
+     models[[i, j]] <- mod #save data
+     deseas[i, j, ] <- resid(mod) #get residuals
+   }
+ }
```

```r
> x<-cbind(x1=3,x2=c(4:1,2:5))
> dimnames(x)[[1]]<-letters[1:8]
> x
  x1 x2
a  3  4
b  3  3
c  3  2
d  3  1
e  3  2
f  3  3
g  3  4
h  3  5
> apply(x,2,mean,trim =.2) #在这里，trim =.2就是mean(x, trim = 0, na.rm = FALSE, ...)函数当中的一个参数。
x1 x2 
 3  3 
> apply(x,1,mean,trim =.2)
  a   b   c   d   e   f   g   h 
3.5 3.0 2.5 2.0 2.5 3.0 3.5 4.0 
> col.sums <- apply(x, 2, sum)
> row.sums <- apply(x, 1, sum)
> rbind(cbind(x, Rtot = row.sums), Ctot = c(col.sums, sum(col.sums)))
     x1 x2 Rtot
a     3  4    7
b     3  3    6
c     3  2    5
d     3  1    4
e     3  2    5
f     3  3    6
g     3  4    7
h     3  5    8
Ctot 24 24   48
> sum.plus.y <- function(x,y){
+   sum(x) + y
+ }
> apply(x, 1, sum.plus.y, 3) #使用自定义函数
 a  b  c  d  e  f  g  h 
10  9  8  7  8  9 10 11 
```

```r
lapply(X, FUN, ...)
sapply(X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
vapply(X, FUN, FUN.VALUE, ..., USE.NAMES = TRUE)
```

```r
> x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
> x
$a
 [1]  1  2  3  4  5  6  7  8  9 10

$beta
[1]  0.04978707  0.13533528  0.36787944  1.00000000  2.71828183  7.38905610 20.08553692

$logic
[1]  TRUE FALSE FALSE  TRUE

> lapply(x,mean)
$a
[1] 5.5

$beta
[1] 4.535125

$logic
[1] 0.5
```

```r
> x<-cbind(x1=3,x2=c(4:1,2:5))
> dimnames(x)[[1]]<-letters[1:8]
> x
  x1 x2
a  3  4
b  3  3
c  3  2
d  3  1
e  3  2
f  3  3
g  3  4
h  3  5
> x<-as.data.frame(x)
> as.list(x)
$x1
[1] 3 3 3 3 3 3 3 3

$x2
[1] 4 3 2 1 2 3 4 5

> lapply(x,function(.ele) mean(.ele))
$x1
[1] 3

$x2
[1] 3

> sapply(x,mean)
x1 x2 
 3  3 
> vapply(x,mean,1)
x1 x2 
 3  3 
```

```r
> i39 <- sapply(3:9, seq)
> i39
[[1]]
[1] 1 2 3

[[2]]
[1] 1 2 3 4

[[3]]
[1] 1 2 3 4 5

[[4]]
[1] 1 2 3 4 5 6

[[5]]
[1] 1 2 3 4 5 6 7

[[6]]
[1] 1 2 3 4 5 6 7 8

[[7]]
[1] 1 2 3 4 5 6 7 8 9

> sapply(i39, fivenum)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7]
[1,]  1.0  1.0    1  1.0  1.0  1.0    1
[2,]  1.5  1.5    2  2.0  2.5  2.5    3
[3,]  2.0  2.5    3  3.5  4.0  4.5    5
[4,]  2.5  3.5    4  5.0  5.5  6.5    7
[5,]  3.0  4.0    5  6.0  7.0  8.0    9
> vapply(i39, fivenum,
+        c(Min. = 0, "1st Qu." = 0, Median = 0, "3rd Qu." = 0, Max. = 0))
        [,1] [,2] [,3] [,4] [,5] [,6] [,7]
Min.     1.0  1.0    1  1.0  1.0  1.0    1
1st Qu.  1.5  1.5    2  2.0  2.5  2.5    3
Median   2.0  2.5    3  3.5  4.0  4.5    5
3rd Qu.  2.5  3.5    4  5.0  5.5  6.5    7
Max.     3.0  4.0    5  6.0  7.0  8.0    9
```

```r
tapply(X, INDEX, FUN = NULL, ..., simplify = TRUE)
mapply(FUN, ..., MoreArgs = NULL, SIMPLIFY = TRUE, USE.NAMES = TRUE)
sweep(x, MARGIN, STATS, FUN="-", check.margin=TRUE, ...)
```

```r
> models <- apply(ozone, 1:2, deseasf) #这里相当于for loop当中的for(i in seq_len(24)){for(j in seq_len(24)){mod<-deseasf(ozone[i,j,]); models[[i,j]]<-mod;}}，但写法更简洁。注意：apply内部仍然是逐个顺序计算的，并不是并行处理。
> resids_list <- lapply(models, resid)
> resids <- unlist(resids_list)
> dim(resids) <- c(72, 24, 24)
> deseas <- aperm(resids, c(2, 3, 1))
> dimnames(deseas) <- dimnames(ozone)
```

```r
R> deseasf_df <- function(df) {
+   rlm(value ~ month - 1, data = df)
+ }
R> pieces <- split(ozonedf, list(ozonedf$lat, ozonedf$long))
R> models <- lapply(pieces, deseasf_df)
R> results <- mapply(function(model, df) {
+   cbind(df[rep(1, 72), c("lat", "long")], resid(model))
+ }, models, pieces)
R> deseasdf <- do.call("rbind", results)
```

```r
> aq<-airquality
> aq$source<-"qiuworld"
> head(aq)
  Ozone Solar.R Wind Temp Month Day   source
1    41     190  7.4   67     5   1 qiuworld
2    36     118  8.0   72     5   2 qiuworld
3    12     149 12.6   74     5   3 qiuworld
4    18     313 11.5   62     5   4 qiuworld
5    NA      NA 14.3   56     5   5 qiuworld
6    28      NA 14.9   66     5   6 qiuworld
> aq1<-aq
> aq1$source<-c("gaziou")
> head(aq1)
  Ozone Solar.R Wind Temp Month Day source
1    41     190  7.4   67     5   1 gaziou
2    36     118  8.0   72     5   2 gaziou
3    12     149 12.6   74     5   3 gaziou
4    18     313 11.5   62     5   4 gaziou
5    NA      NA 14.3   56     5   5 gaziou
6    28      NA 14.9   66     5   6 gaziou
> set.seed(123)
> x1<-runif(nrow(aq1),-0.5,0.5)
> head(x1)
[1] -0.21242248  0.28830514 -0.09102308  0.38301740  0.44046728 -0.45444350
> aq1$Temp<-aq1$Temp+x1
> head(aq1$Temp)
[1] 66.78758 72.28831 73.90898 62.38302 56.44047 65.54556
> aq<-rbind(aq,aq1)
> aq<-aq[order(aq$Wind),]
> head(aq)
    Ozone Solar.R Wind     Temp Month Day   source
53     NA      59  1.7 76.00000     6  22 qiuworld
206    NA      59  1.7 76.29892     6  22   gaziou
121   118     225  2.3 94.00000     8  29 qiuworld
274   118     225  2.3 94.14789     8  29   gaziou
126    73     183  2.8 93.00000     9   3 qiuworld
279    73     183  2.8 93.48422     9   3   gaziou
```

> pieces <- split(aq, list(aq\$source,aq\$Month))> pieces[1:2]\$gaziou.5 Ozone Solar.R Wind Temp Month Day source183 115 223 5.7 78.64711 5 30 gaziou164 7 NA 6.9 74.45683 5 11 gaziou154 41 190 7.4 66.78758 5 1 gaziou184 37 279 7.4 76.46302 5 31 gaziou155 36 118 8.0 72.28831 5 2 gaziou180 NA NA 8.0 57.04407 5 27 gaziou160 23 299 8.6 65.02811 5 7 gaziou163 NA 194 8.6 68.95661 5 10 gaziou166 11 290 9.2 66.17757 5 13 gaziou165 16 256 9.7 68.95333 5 12 gaziou173 11 44 9.7 62.45450 5 20 gaziou174 1 8 9.7 59.38954 5 21 gaziou176 4 25 9.7 61.14051 5 23 gaziou167 14 274 10.9 68.07263 5 14 gaziou157 18 313 11.5 62.38302 5 4 gaziou169 14 334 11.5 64.39982 5 16 gaziou172 30 322 11.5 67.82792 5 19 gaziou170 34 307 12.0 65.74609 5 17 gaziou177 32 92 12.0 61.49427 5 24 gaziou181 23 13 12.0 67.09414 5 28 gaziou156 12 149 12.6 73.90898 5 3 gaziou168 18 65 13.2 57.60292 5 15 gaziou161 19 99 13.8 59.39242 5 8 gaziou158 NA NA 14.3 56.44047 5 5 gaziou159 28 NA 14.9 65.54556 5 6 gaziou179 NA 266 14.9 58.20853 5 26 gaziou182 45 252 14.9 80.78916 5 29 gaziou175 11 320 16.6 73.19280 5 22 gaziou178 NA 66 16.6 57.15571 5 25 gaziou171 6 78 18.4 56.54206 5 18 gaziou162 8 19 20.1 61.05144 5 9 gaziou \$qiuworld.5 Ozone Solar.R Wind Temp Month Day source30 115 223 5.7 79 5 30 qiuworld11 7 NA 6.9 74 5 11 qiuworld1 41 190 7.4 67 5 1 qiuworld31 37 279 7.4 76 5 31 qiuworld2 36 118 8.0 72 5 2 qiuworld27 NA NA 8.0 57 5 27 qiuworld7 23 299 8.6 65 5 7 qiuworld10 NA 194 8.6 69 5 10 qiuworld13 11 290 9.2 66 5 13 qiuworld12 16 256 9.7 69 5 12 qiuworld20 11 44 9.7 62 5 20 qiuworld21 1 8 9.7 59 5 21 qiuworld23 4 25 9.7 61 5 23 qiuworld14 14 274 10.9 68 5 14 qiuworld4 18 313 11.5 62 5 4 qiuworld16 14 334 11.5 64 5 16 qiuworld19 30 322 11.5 68 5 19 qiuworld17 34 307 12.0 66 5 17 qiuworld24 32 92 12.0 61 5 24 qiuworld28 23 13 12.0 67 5 28 qiuworld3 12 149 12.6 74 5 3 qiuworld15 18 65 13.2 58 5 15 qiuworld8 19 99 13.8 59 5 8 qiuworld5 NA NA 14.3 56 5 5 qiuworld6 28 NA 14.9 66 5 6 qiuworld26 NA 266 14.9 58 5 26 
qiuworld29 45 252 14.9 81 5 29 qiuworld22 11 320 16.6 73 5 22 qiuworld25 NA 66 16.6 57 5 25 qiuworld18 6 78 18.4 57 5 18 qiuworld9 8 19 20.1 61 5 9 qiuworld > avgTemp<-lapply(pieces,function(.ele) mean(.ele\$Temp))> head(avgTemp)\$gaziou.5[1] 65.63339 \$qiuworld.5[1] 65.54839 \$gaziou.6[1] 79.02871 \$qiuworld.6[1] 79.1 \$gaziou.7[1] 83.90313 \$qiuworld.7[1] 83.90323 > avgTemp<-do.call(rbind,avgTemp)> head(avgTemp) [,1]gaziou.5 65.63339qiuworld.5 65.54839gaziou.6 79.02871qiuworld.6 79.10000gaziou.7 83.90313qiuworld.7 83.90323

```r
> avgTemp1<-ddply(aq,.(source,Month),function(.ele) mean(.ele$Temp))
> avgTemp1
     source Month       V1
1    gaziou     5 65.63339
2    gaziou     6 79.02871
3    gaziou     7 83.90313
4    gaziou     8 83.98611
5    gaziou     9 76.89319
6  qiuworld     5 65.54839
7  qiuworld     6 79.10000
8  qiuworld     7 83.90323
9  qiuworld     8 83.96774
10 qiuworld     9 76.90000
```

plyr包给我们提供了非常简洁的书写方式。我们来看看其主要函数的定义方式。

| Input \ Output | Array | Data frame | List | Discarded（丢弃输出） |
|---|---|---|---|---|
| Array | aaply | adply | alply | a_ply |
| Data frame | daply | ddply | dlply | d_ply |
| List | laply | ldply | llply | l_ply |

```r
a*ply(.data, .margins, .fun, ..., .progress = "none")
d*ply(.data, .variables, .fun, ..., .progress = "none")
l*ply(.data, .fun, ..., .progress = "none")
```

• .margins = 1 #以行为单位
• .margins = 2 #以列为单位
• .margins = c(1,2) #以individual cell为单位

821 篇文章169 人订阅

0 条评论

## 相关文章

1163

### 泛函编程（17）－泛函状态－State In Action

对OOP编程人员来说，泛函状态State是一种全新的数据类型。我们在上节做了些介绍，在这节我们讨论一下State类型的应用：用一个具体的例子来示范如何使...

2058

7487

3135

### Codeforces 777C Alyona and Spreadsheet

C. Alyona and Spreadsheet time limit per test:1 second memory limit per test:256...

38413

### 第八届福建省大学生程序设计竞赛 | FZU2280 Magic

Kim is a magician, he can use n kinds of magic, number from 1 to n. We use strin...

1173

2847

1520

3377

### 约瑟夫问题方法总结

n个人围成一个圈，每个人分别标注为1、2、...、n，要求从1号从1开始报数，报到k的人出圈，接着下一个人又从1开始报数，如此循环，直到只剩最后一个人时，该人即...

3258