library(plyr)
1 aaply/ adply/ alply
Split array, apply function, and return results in an array/data frame/list
dim(ozone)
## [1] 24 24 72
aaply(ozone, 1, mean)
## -21.2 -18.7 -16.2 -13.7 -11.2 -8.7 -6.2 -3.7
## 266.8194 263.0104 260.6493 258.8148 257.8657 256.9306 256.1007 255.6238
## -1.2 1.3 3.8 6.3 8.7 11.2 13.7 16.2
## 255.5081 255.0718 254.1771 254.5139 256.0729 258.8160 261.3009 263.7072
## 18.7 21.2 23.7 26.2 28.7 31.2 33.7 36.2
## 266.4005 269.9294 273.9062 279.5926 285.3356 293.2234 300.2546 308.7153
aaply(ozone, 1, mean, .drop = FALSE)
##
## lat 1
## -21.2 266.8194
## -18.7 263.0104
## -16.2 260.6493
## -13.7 258.8148
## -11.2 257.8657
## -8.7 256.9306
## -6.2 256.1007
## -3.7 255.6238
## -1.2 255.5081
## 1.3 255.0718
## 3.8 254.1771
## 6.3 254.5139
## 8.7 256.0729
## 11.2 258.8160
## 13.7 261.3009
## 16.2 263.7072
## 18.7 266.4005
## 21.2 269.9294
## 23.7 273.9062
## 26.2 279.5926
## 28.7 285.3356
## 31.2 293.2234
## 33.7 300.2546
## 36.2 308.7153
aaply(ozone, 1, each(min, max))
##
## lat min max
## -21.2 242 312
## -18.7 240 288
## -16.2 240 282
## -13.7 238 280
## -11.2 238 280
## -8.7 240 278
## -6.2 236 278
## -3.7 234 280
## -1.2 232 280
## 1.3 232 278
## 3.8 234 282
## 6.3 234 286
## 8.7 236 282
## 11.2 236 284
## 13.7 238 286
## 16.2 240 292
## 18.7 244 294
## 21.2 242 302
## 23.7 250 308
## 26.2 256 322
## 28.7 250 330
## 31.2 264 350
## 33.7 266 360
## 36.2 268 390
adply(ozone, 1, mean)
## lat V1
## 1 -21.2 266.8194
## 2 -18.7 263.0104
## 3 -16.2 260.6493
## 4 -13.7 258.8148
## 5 -11.2 257.8657
## 6 -8.7 256.9306
## 7 -6.2 256.1007
## 8 -3.7 255.6238
## 9 -1.2 255.5081
## 10 1.3 255.0718
## 11 3.8 254.1771
## 12 6.3 254.5139
## 13 8.7 256.0729
## 14 11.2 258.8160
## 15 13.7 261.3009
## 16 16.2 263.7072
## 17 18.7 266.4005
## 18 21.2 269.9294
## 19 23.7 273.9062
## 20 26.2 279.5926
## 21 28.7 285.3356
## 22 31.2 293.2234
## 23 33.7 300.2546
## 24 36.2 308.7153
alply(ozone, 1, quantile)
## $`1`
## 0% 25% 50% 75% 100%
## 242 258 266 278 312
##
## $`2`
## 0% 25% 50% 75% 100%
## 240 256 262 272 288
##
## $`3`
## 0% 25% 50% 75% 100%
## 240 254 260 268 282
##
## $`4`
## 0% 25% 50% 75% 100%
## 238 252 258 266 280
##
## $`5`
## 0% 25% 50% 75% 100%
## 238 252 258 264 280
##
## $`6`
## 0% 25% 50% 75% 100%
## 240 250 256 264 278
##
## $`7`
## 0% 25% 50% 75% 100%
## 236 248 254 264 278
##
## $`8`
## 0% 25% 50% 75% 100%
## 234 248 254 264 280
##
## $`9`
## 0% 25% 50% 75% 100%
## 232 248 254 262 280
##
## $`10`
## 0% 25% 50% 75% 100%
## 232 248 254 262 278
##
## $`11`
## 0% 25% 50% 75% 100%
## 234 248 252 260 282
##
## $`12`
## 0% 25% 50% 75% 100%
## 234 248 254 260 286
##
## $`13`
## 0% 25% 50% 75% 100%
## 236 250 255 262 282
##
## $`14`
## 0% 25% 50% 75% 100%
## 236 252 258 266 284
##
## $`15`
## 0% 25% 50% 75% 100%
## 238 254 260 268 286
##
## $`16`
## 0% 25% 50% 75% 100%
## 240 256 264 272 292
##
## $`17`
## 0% 25% 50% 75% 100%
## 244 258 266 274 294
##
## $`18`
## 0% 25% 50% 75% 100%
## 242 260 270 278 302
##
## $`19`
## 0% 25% 50% 75% 100%
## 250 264 274 282 308
##
## $`20`
## 0% 25% 50% 75% 100%
## 256 270 278 290 322
##
## $`21`
## 0% 25% 50% 75% 100%
## 250 274 284 294 330
##
## $`22`
## 0% 25% 50% 75% 100%
## 264 282 290 304 350
##
## $`23`
## 0% 25% 50% 75% 100%
## 266 286 296 314 360
##
## $`24`
## 0% 25% 50% 75% 100%
## 268 292 304 324 390
##
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
## lat
## 1 -21.2
## 2 -18.7
## 3 -16.2
## 4 -13.7
## 5 -11.2
## 6 -8.7
## 7 -6.2
## 8 -3.7
## 9 -1.2
## 10 1.3
## 11 3.8
## 12 6.3
## 13 8.7
## 14 11.2
## 15 13.7
## 16 16.2
## 17 18.7
## 18 21.2
## 19 23.7
## 20 26.2
## 21 28.7
## 22 31.2
## 23 33.7
## 24 36.2
2 daply/ ddply/ dlply
Split data frame, apply function, and return results in an array/ data.frame/ list
daply(mtcars, .(cyl), nrow)
## 4 6 8
## 11 7 14
daply(mtcars, .(cyl), colwise(mean))
##
## cyl mpg disp hp drat wt qsec vs
## 4 26.66364 105.1364 82.63636 4.070909 2.285727 19.13727 0.9090909
## 6 19.74286 183.3143 122.2857 3.585714 3.117143 17.97714 0.5714286
## 8 15.1 353.1 209.2143 3.229286 3.999214 16.77214 0
##
## cyl am gear carb
## 4 0.7272727 4.090909 1.545455
## 6 0.4285714 3.857143 3.428571
## 8 0.1428571 3.285714 3.5
daply(mtcars, .(cyl), function(df) colwise(mean)(df[,c(1,3,4,6)]))
##
## cyl mpg disp hp wt
## 4 26.66364 105.1364 82.63636 2.285727
## 6 19.74286 183.3143 122.2857 3.117143
## 8 15.1 353.1 209.2143 3.999214
ddply(mtcars, .(cyl),colwise(mean))
## cyl mpg disp hp drat wt qsec vs
## 1 4 26.66364 105.1364 82.63636 4.070909 2.285727 19.13727 0.9090909
## 2 6 19.74286 183.3143 122.28571 3.585714 3.117143 17.97714 0.5714286
## 3 8 15.10000 353.1000 209.21429 3.229286 3.999214 16.77214 0.0000000
## am gear carb
## 1 0.7272727 4.090909 1.545455
## 2 0.4285714 3.857143 3.428571
## 3 0.1428571 3.285714 3.500000
linmod <- function(df) {
lm(rbi ~ year, data = mutate(df, year = year - min(year)))
}
models <- dlply(baseball, .(id), linmod)
models[[1]]
##
## Call:
## lm(formula = rbi ~ year, data = mutate(df, year = year - min(year)))
##
## Coefficients:
## (Intercept) year
## 118.924 -1.732
coef <- ldply(models, coef)
with(coef, plot(`(Intercept)`, year))
qual <- laply(models, function(mod) summary(mod)$r.squared)
hist(qual)
3 strip_splits
Remove splitting variables from a data frame.
dlply(mtcars, c("vs", "am"))$'0.1'
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 4 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 5 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 6 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
dlply(mtcars, c("vs", "am"), strip_splits)$'0.1'
## mpg cyl disp hp drat wt qsec gear carb
## 1 21.0 6 160.0 110 3.90 2.620 16.46 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 4 4
## 3 26.0 4 120.3 91 4.43 2.140 16.70 5 2
## 4 15.8 8 351.0 264 4.22 3.170 14.50 5 4
## 5 19.7 6 145.0 175 3.62 2.770 15.50 5 6
## 6 15.0 8 301.0 335 3.54 3.570 14.60 5 8
4 laply/ ldply/ llply
Split list, apply function, and return results in an array.
laply(baseball, is.factor)
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
ldply(baseball, is.factor)
## .id V1
## 1 id FALSE
## 2 year FALSE
## 3 stint FALSE
## 4 team FALSE
## 5 lg FALSE
## 6 g FALSE
## 7 ab FALSE
## 8 r FALSE
## 9 h FALSE
## 10 X2b FALSE
## 11 X3b FALSE
## 12 hr FALSE
## 13 rbi FALSE
## 14 sb FALSE
## 15 cs FALSE
## 16 bb FALSE
## 17 so FALSE
## 18 ibb FALSE
## 19 hbp FALSE
## 20 sh FALSE
## 21 sf FALSE
## 22 gidp FALSE
colwise(is.factor)(baseball)
## id year stint team lg g ab r h X2b X3b hr
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## rbi sb cs bb so ibb hbp sh sf gidp
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
laply(seq_len(10), identity)
## [1] 1 2 3 4 5 6 7 8 9 10
laply(seq_len(10), rep, times = 4)
## 1 2 3 4
## [1,] 1 1 1 1
## [2,] 2 2 2 2
## [3,] 3 3 3 3
## [4,] 4 4 4 4
## [5,] 5 5 5 5
## [6,] 6 6 6 6
## [7,] 7 7 7 7
## [8,] 8 8 8 8
## [9,] 9 9 9 9
## [10,] 10 10 10 10
laply(seq_len(10), matrix, nrow = 2, ncol = 2)
## , , 1
##
## 1 2
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
## [6,] 6 6
## [7,] 7 7
## [8,] 8 8
## [9,] 9 9
## [10,] 10 10
##
## , , 2
##
## 1 2
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
## [6,] 6 6
## [7,] 7 7
## [8,] 8 8
## [9,] 9 9
## [10,] 10 10
#
llply(mtcars, round)
## $mpg
## [1] 21 21 23 21 19 18 14 24 23 19 18 16 17 15 10 10 15 32 30 34 22 16 15
## [24] 13 19 27 26 30 16 20 15 21
##
## $cyl
## [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4
##
## $disp
## [1] 160 160 108 258 360 225 360 147 141 168 168 276 276 276 472 460 440
## [18] 79 76 71 120 318 304 350 400 79 120 95 351 145 301 121
##
## $hp
## [1] 110 110 93 110 175 105 245 62 95 123 123 180 180 180 205 215 230
## [18] 66 52 65 97 150 150 245 175 66 91 113 264 175 335 109
##
## $drat
## [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 5 4 4 3 3 4 3 4 4 4 4 4 4 4
##
## $wt
## [1] 3 3 2 3 3 3 4 3 3 3 3 4 4 4 5 5 5 2 2 2 2 4 3 4 4 2 2 2 3 3 4 3
##
## $qsec
## [1] 16 17 19 19 17 20 16 20 23 18 19 17 18 18 18 18 17 19 19 20 20 17 17
## [24] 15 17 19 17 17 14 16 15 19
##
## $vs
## [1] 0 0 1 1 0 1 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 0 0 0 1
##
## $am
## [1] 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1
##
## $gear
## [1] 4 4 4 3 3 3 3 4 4 4 4 3 3 3 3 3 3 4 4 4 3 3 3 3 3 4 5 5 5 5 5 4
##
## $carb
## [1] 4 4 1 1 2 1 4 2 2 4 4 3 3 3 4 4 4 1 2 1 1 2 2 4 2 1 2 2 4 6 8 2
llply(llply(mtcars, round), table)
## $mpg
##
## 10 13 14 15 16 17 18 19 20 21 22 23 24 26 27 30 32 34
## 2 1 1 4 3 1 2 3 1 4 1 2 1 1 1 2 1 1
##
## $cyl
##
## 4 6 8
## 11 7 14
##
## $disp
##
## 71 76 79 95 108 120 121 141 145 147 160 168 225 258 276 301 304 318
## 1 1 2 1 1 2 1 1 1 1 2 2 1 1 3 1 1 1
## 350 351 360 400 440 460 472
## 1 1 2 1 1 1 1
##
## $hp
##
## 52 62 65 66 91 93 95 97 105 109 110 113 123 150 175 180 205 215
## 1 1 1 2 1 1 1 1 1 1 3 1 2 2 3 3 1 1
## 230 245 264 335
## 1 2 1 1
##
## $drat
##
## 3 4 5
## 13 18 1
##
## $wt
##
## 2 3 4 5
## 8 13 8 3
##
## $qsec
##
## 14 15 16 17 18 19 20 23
## 1 2 3 9 5 7 4 1
##
## $vs
##
## 0 1
## 18 14
##
## $am
##
## 0 1
## 19 13
##
## $gear
##
## 3 4 5
## 15 12 5
##
## $carb
##
## 1 2 3 4 6 8
## 7 10 3 10 1 1
x <- list(a = 1:10, beta = exp(-3:3), logic = c(TRUE,FALSE,FALSE,TRUE))
llply(x, mean)
## $a
## [1] 5.5
##
## $beta
## [1] 4.535125
##
## $logic
## [1] 0.5
llply(x, quantile, probs = 1:3/4)
## $a
## 25% 50% 75%
## 3.25 5.50 7.75
##
## $beta
## 25% 50% 75%
## 0.2516074 1.0000000 5.0536690
##
## $logic
## 25% 50% 75%
## 0.0 0.5 1.0
5 maply/ mdply/ mlply
Call function with arguments in array or data frame, returning an array.
maply(cbind(mean = 1:5, sd = 1:5), rnorm, n = 2)
## , , = 1
##
## sd
## mean 1 2 3 4 5
## 1 1.801657 NA NA NA NA
## 2 NA 1.106095 NA NA NA
## 3 NA NA 0.8217647 NA NA
## 4 NA NA NA -2.821501 NA
## 5 NA NA NA NA 9.590347
##
## , , = 2
##
## sd
## mean 1 2 3 4 5
## 1 -1.030519 NA NA NA NA
## 2 NA 0.1580219 NA NA NA
## 3 NA NA 5.815589 NA NA
## 4 NA NA NA 6.255045 NA
## 5 NA NA NA NA 6.517154
maply(expand.grid(mean = 1:5, sd = 1:5), rnorm, n = 2)
## , , = 1
##
## sd
## mean 1 2 3 4 5
## 1 1.769694 2.504520 4.549259 1.4664670 3.450159
## 2 1.980856 1.460088 2.646320 0.8999684 6.258953
## 3 4.585556 6.166523 5.006672 -3.8949086 6.607827
## 4 4.880003 6.970657 3.990407 7.7987213 5.337034
## 5 7.484481 5.097291 10.876730 8.8536157 -4.297272
##
## , , = 2
##
## sd
## mean 1 2 3 4 5
## 1 0.1975609 0.9011166 2.1824404 4.822353 5.716088
## 2 2.1209043 2.7668559 1.1869081 -0.226855 3.482352
## 3 3.8654387 6.8412029 4.8279667 -5.854408 9.292712
## 4 2.7303377 4.0600362 0.6514371 -3.852541 11.427749
## 5 4.2586197 5.1160319 6.9453713 6.661454 8.823536
maply(cbind(1:5, 1:5), rnorm, n = 2)
## , , = 1
##
##
## 1 2 3 4 5
## 1 0.8652638 NA NA NA NA
## 2 NA 4.749966 NA NA NA
## 3 NA NA 6.663847 NA NA
## 4 NA NA NA -1.09592 NA
## 5 NA NA NA NA 2.808144
##
## , , = 2
##
##
## 1 2 3 4 5
## 1 2.066592 NA NA NA NA
## 2 NA 0.5606877 NA NA NA
## 3 NA NA 10.46654 NA NA
## 4 NA NA NA 5.488294 NA
## 5 NA NA NA NA 7.01038
#
mdply(cbind(mean = 1:5, sd = 1:5), rnorm, n = 5)
## mean sd V1 V2 V3 V4 V5
## 1 1 1 2.0787665 2.223941 0.4480055 -0.58103212 -0.1337331
## 2 2 2 2.5410636 4.501332 4.5618587 -0.07465344 3.2049112
## 3 3 3 6.7018207 -3.923504 2.7031612 7.52608293 -2.1594058
## 4 4 4 4.9818659 7.629789 9.5450085 -1.41085091 3.8229480
## 5 5 5 -0.3018354 13.875395 5.7283652 5.88079705 4.2506527
mdply(expand.grid(mean = 1:5, sd = 1:5), rnorm, n = 2)
## mean sd V1 V2
## 1 1 1 0.5961004 1.3791960
## 2 2 1 1.6291092 1.1251712
## 3 3 1 3.0887375 2.7994661
## 4 4 1 5.0459975 1.6178327
## 5 5 1 5.1887329 5.2133212
## 6 1 2 7.7679693 1.3047763
## 7 2 2 2.6214345 0.1847116
## 8 3 2 2.2126320 2.9372685
## 9 4 2 3.7124170 0.4576352
## 10 5 2 2.9408175 3.3447097
## 11 1 3 2.1436248 0.7998921
## 12 2 3 1.8270963 6.0001164
## 13 3 3 1.9811834 3.9177688
## 14 4 3 3.7146122 6.9759337
## 15 5 3 6.4166485 5.5713630
## 16 1 4 2.4016234 -1.5215914
## 17 2 4 3.7574388 -1.3499098
## 18 3 4 1.5066352 5.2455257
## 19 4 4 8.2638396 0.4191990
## 20 5 4 2.6317600 9.4836330
## 21 1 5 1.0570083 -2.6440292
## 22 2 5 0.1118640 2.7049722
## 23 3 5 6.5042379 5.8349988
## 24 4 5 5.6982451 9.1249979
## 25 5 5 12.2881738 5.0561882
mdply(cbind(mean = 1:5, sd = 1:5), as.data.frame(rnorm), n = 5)
## mean sd value
## 1 1 1 -0.5532740
## 2 1 1 1.7532357
## 3 1 1 0.0986697
## 4 1 1 0.4944914
## 5 1 1 1.0396215
## 6 2 2 5.0723042
## 7 2 2 0.5392697
## 8 2 2 1.6056064
## 9 2 2 2.5414284
## 10 2 2 3.0733797
## 11 3 3 4.0729889
## 12 3 3 9.1024527
## 13 3 3 8.0094705
## 14 3 3 6.8332345
## 15 3 3 0.2661260
## 16 4 4 6.7809214
## 17 4 4 7.9940996
## 18 4 4 6.3757652
## 19 4 4 4.0488428
## 20 4 4 4.3720060
## 21 5 5 5.8582439
## 22 5 5 11.2575084
## 23 5 5 0.3501260
## 24 5 5 3.6368174
## 25 5 5 9.8025286
#
mlply(cbind(1:4, 4:1), rep)
## $`1`
## [1] 1 1 1 1
##
## $`2`
## [1] 2 2 2
##
## $`3`
## [1] 3 3
##
## $`4`
## [1] 4
##
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
##
## 1 1 4
## 2 2 3
## 3 3 2
## 4 4 1
mlply(cbind(1:4, times = 4:1), rep)
## $`1`
## [1] 1 1 1 1
##
## $`2`
## [1] 2 2 2
##
## $`3`
## [1] 3 3
##
## $`4`
## [1] 4
##
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
## times
## 1 1 4
## 2 2 3
## 3 3 2
## 4 4 1
mlply(cbind(1:4, 4:1), seq)
## $`1`
## [1] 1 2 3 4
##
## $`2`
## [1] 2 3
##
## $`3`
## [1] 3 2
##
## $`4`
## [1] 4 3 2 1
##
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
##
## 1 1 4
## 2 2 3
## 3 3 2
## 4 4 1
mlply(cbind(1:4, length = 4:1), seq)
## $`1`
## [1] 1 2 3 4
##
## $`2`
## [1] 2 3 4
##
## $`3`
## [1] 3 4
##
## $`4`
## [1] 4
##
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
## length
## 1 1 4
## 2 2 3
## 3 3 2
## 4 4 1
mlply(cbind(1:4, by = 4:1), seq, to = 20)
## $`1`
## [1] 1 5 9 13 17
##
## $`2`
## [1] 2 5 8 11 14 17 20
##
## $`3`
## [1] 3 5 7 9 11 13 15 17 19
##
## $`4`
## [1] 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
##
## attr(,"split_type")
## [1] "array"
## attr(,"split_labels")
## by
## 1 1 4
## 2 2 3
## 3 3 2
## 4 4 1
6 raply/ rdply/ rlply
Replicate expression and return results in a array/ data.frame/ list
raply(10, mean(runif(100)))
## [1] 0.4910119 0.5024170 0.5003596 0.4809902 0.4828145 0.5365963 0.5106904
## [8] 0.5027685 0.5109807 0.5196246
raply(10, each(mean, var)(runif(100)))
## mean var
## [1,] 0.5434708 0.08560793
## [2,] 0.4682580 0.07898800
## [3,] 0.5042066 0.08370707
## [4,] 0.4871877 0.09328363
## [5,] 0.5333053 0.08355126
## [6,] 0.5303828 0.08755262
## [7,] 0.5022448 0.09076764
## [8,] 0.5391451 0.07751627
## [9,] 0.5060296 0.08053948
## [10,] 0.5427813 0.10018072
hist(raply(1000, mean(rexp(1000))))
#
rdply(20, each(mean, var)(runif(100)))
## .n mean var
## 1 1 0.5353184 0.07202726
## 2 2 0.4685266 0.07709420
## 3 3 0.5146181 0.07449306
## 4 4 0.5296048 0.08182465
## 5 5 0.4847191 0.09275419
## 6 6 0.5406801 0.08494226
## 7 7 0.5007430 0.08600186
## 8 8 0.4948883 0.07834916
## 9 9 0.4789544 0.09872664
## 10 10 0.5260734 0.08358004
## 11 11 0.5411781 0.08026287
## 12 12 0.4757993 0.07788696
## 13 13 0.5261861 0.08718621
## 14 14 0.4808975 0.08441186
## 15 15 0.5059638 0.08007830
## 16 16 0.5143309 0.09769331
## 17 17 0.4989572 0.08590434
## 18 18 0.4380903 0.08102582
## 19 19 0.4771784 0.09595031
## 20 20 0.4556680 0.06813685
#
mods <- rlply(100, lm(y ~ x, data=data.frame(x=rnorm(100), y=rnorm(100))))
hist(laply(mods, function(x) summary(x)$r.squared))
7 arrange
Order a data frame by its colums.
arrange(mtcars, cyl, disp) #same as mtcars[with(mtcars, order(cyl, disp)), ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 2 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 3 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 4 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 5 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 6 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 7 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 8 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 9 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 10 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 11 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 12 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 13 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 14 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 15 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 16 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 17 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 18 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 19 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 20 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 21 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 22 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 24 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 25 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 26 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 27 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 28 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 29 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 30 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 31 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 32 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
8 decent
Transform a vector into a format that will be sorted in descending order
desc(1:10)
## [1] -1 -2 -3 -4 -5 -6 -7 -8 -9 -10
desc(factor(letters))
## [1] -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17
## [18] -18 -19 -20 -21 -22 -23 -24 -25 -26
9 colwise
Turn a function that operates on a vector into a function that operates column-wise on a data.frame.
nmissing <- function(x) sum(is.na(x))
# Apply to every column in a data frame
colwise(nmissing)(baseball)
## id year stint team lg g ab r h X2b X3b hr rbi sb cs bb so ibb hbp
## 1 0 0 0 0 0 0 0 0 0 0 0 0 12 250 4525 0 1305 7528 377
## sh sf gidp
## 1 960 7390 5272
10 count
Equivalent to as.data.frame(table(x)), but does not include combinations with zero counts.
count(mtcars,"cyl")
## cyl freq
## 1 4 11
## 2 6 7
## 3 8 14
count(mtcars,"cyl","gear")
## cyl freq
## 1 4 45
## 2 6 27
## 3 8 46
11 each
Aggregate multiple functions into a single function
12 failwith
Modify a function so that it returns a default value when there is an error.
f <- function(x) if (x == 1) stop("Error!") else 1
f(1)
f(2)
safef <- failwith(NULL, f)
safef(1)
safef(2)
13 match_df
Extract matching rows of a data frame
longterm <- subset(count(baseball, "id"), freq > 25)
bb_longterm <- match_df(baseball, longterm)
## Matching on: id
dim(baseball)
## [1] 21699 22
dim(longterm)
## [1] 14 2
dim(bb_longterm)
## [1] 383 22
14 mutate
Mutate a data frame by adding new or replacing existing columns.
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
head(mutate(airquality, Temp = (Temp - 32) / 1.8, ozt = Ozone / Temp))
## Ozone Solar.R Wind Temp Month Day ozt
## 1 41 190 7.4 19.44444 5 1 2.1085714
## 2 36 118 8.0 22.22222 5 2 1.6200000
## 3 12 149 12.6 23.33333 5 3 0.5142857
## 4 18 313 11.5 16.66667 5 4 1.0800000
## 5 NA NA 14.3 13.33333 5 5 NA
## 6 28 NA 14.9 18.88889 5 6 1.4823529
15 join
Join two data frames together.
16 rbind.fill/ rbind.fill.matrix
Combine data.frames by row, filling in missing columns Bind matrices by row, and fill missing columns with NA
rbind.fill(mtcars[c("mpg", "wt")], mtcars[c("wt", "cyl")])
## mpg wt cyl
## 1 21.0 2.620 NA
## 2 21.0 2.875 NA
## 3 22.8 2.320 NA
## 4 21.4 3.215 NA
## 5 18.7 3.440 NA
## 6 18.1 3.460 NA
## 7 14.3 3.570 NA
## 8 24.4 3.190 NA
## 9 22.8 3.150 NA
## 10 19.2 3.440 NA
## 11 17.8 3.440 NA
## 12 16.4 4.070 NA
## 13 17.3 3.730 NA
## 14 15.2 3.780 NA
## 15 10.4 5.250 NA
## 16 10.4 5.424 NA
## 17 14.7 5.345 NA
## 18 32.4 2.200 NA
## 19 30.4 1.615 NA
## 20 33.9 1.835 NA
## 21 21.5 2.465 NA
## 22 15.5 3.520 NA
## 23 15.2 3.435 NA
## 24 13.3 3.840 NA
## 25 19.2 3.845 NA
## 26 27.3 1.935 NA
## 27 26.0 2.140 NA
## 28 30.4 1.513 NA
## 29 15.8 3.170 NA
## 30 19.7 2.770 NA
## 31 15.0 3.570 NA
## 32 21.4 2.780 NA
## 33 NA 2.620 6
## 34 NA 2.875 6
## 35 NA 2.320 4
## 36 NA 3.215 6
## 37 NA 3.440 8
## 38 NA 3.460 6
## 39 NA 3.570 8
## 40 NA 3.190 4
## 41 NA 3.150 4
## 42 NA 3.440 6
## 43 NA 3.440 6
## 44 NA 4.070 8
## 45 NA 3.730 8
## 46 NA 3.780 8
## 47 NA 5.250 8
## 48 NA 5.424 8
## 49 NA 5.345 8
## 50 NA 2.200 4
## 51 NA 1.615 4
## 52 NA 1.835 4
## 53 NA 2.465 4
## 54 NA 3.520 8
## 55 NA 3.435 8
## 56 NA 3.840 8
## 57 NA 3.845 8
## 58 NA 1.935 4
## 59 NA 2.140 4
## 60 NA 1.513 4
## 61 NA 3.170 8
## 62 NA 2.770 6
## 63 NA 3.570 8
## 64 NA 2.780 4
A <- matrix (1:4, 2)
B <- matrix (6:11, 2)
A
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
B
## [,1] [,2] [,3]
## [1,] 6 8 10
## [2,] 7 9 11
rbind.fill.matrix (A, B)
## 1 2 3
## [1,] 1 3 NA
## [2,] 2 4 NA
## [3,] 6 8 10
## [4,] 7 9 11
17 rename
Modify names by name, not position
rename(mtcars, c("disp" = "displ"))
## mpg cyl displ hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
18 summarise
Summarise works in an analagous way to transform, except instead of adding columns to an existing data frame, it creates a new one. This is particularly useful in conjunction with ddply as it makes it easy to perform group-wise summaries.
summarise(baseball, duration = max(year) - min(year),
nteams = length(unique(team)))
## duration nteams
## 1 136 132
head(ddply(baseball, "id", summarise, duration = max(year) - min(year), nteams = length(unique(team))) )
## id duration nteams
## 1 aaronha01 22 3
## 2 abernte02 17 7
## 3 adairje01 12 4
## 4 adamsba01 20 2
## 5 adamsbo03 13 4
## 6 adcocjo01 16 5
19 vaggregate
n <- 17; fac <- factor(rep(1:3, length = n), levels = 1:5)
table(fac)
## fac
## 1 2 3 4 5
## 6 6 5 0 0
vaggregate(1:n, fac, sum)
## [1] 51 57 45 0 0
vaggregate(1:n, fac, sum, .default = NA_integer_)
## [1] 51 57 45 NA NA
vaggregate(1:n, fac, range)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 2 3 Inf Inf
## [2,] 16 17 15 -Inf -Inf
vaggregate(1:n, fac, range, .default = c(NA, NA) + 0)
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 2 3 NA NA
## [2,] 16 17 15 NA NA
vaggregate(1:n, fac, quantile)
## [,1] [,2] [,3] [,4] [,5]
## 0% 1.00 2.00 3 NA NA
## 25% 4.75 5.75 6 NA NA
## 50% 8.50 9.50 9 NA NA
## 75% 12.25 13.25 12 NA NA
## 100% 16.00 17.00 15 NA NA