-
Notifications
You must be signed in to change notification settings - Fork 990
Benchmarks : Grouping
arunsrinivasan edited this page Aug 26, 2014
·
31 revisions
- The input data is randomly ordered. No pre-sort, no indexes, no key.
- 5 simple queries are run similar to what a data analyst might do in practice; i.e., various ad hoc aggregations as the data is explored and investigated.
- Each package is tested separately in its own fresh R session.
- Each query is repeated once more, immediately. This is to isolate cache effects and confirm the first timing. The first and second times are displayed.
- The results are compared and checked allowing for numeric tolerance and column name differences.
Scroll to the bottom to find reproducible code and system info.
## data.table run
$ R --vanilla
require(data.table)
N=2e9; K=100
set.seed(1)
DT <- data.table(
id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
id2 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
id4 = sample(K, N, TRUE), # large groups (int)
id5 = sample(K, N, TRUE), # large groups (int)
id6 = sample(N/K, N, TRUE), # small groups (int)
v1 = sample(5, N, TRUE), # int in range [1,5]
v2 = sample(1e6, N, TRUE), # int in range [1,1e6]
v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
)
cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
system.time( DT[, sum(v1), keyby=id1] )
system.time( DT[, sum(v1), keyby=id1] )
system.time( DT[, sum(v1), keyby="id1,id2"] )
system.time( DT[, sum(v1), keyby="id1,id2"] )
system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
## dplyr run
$ R --vanilla
require(dplyr)
N=2e9; K=100
set.seed(1)
DF <- data.frame(stringsAsFactors=FALSE,
id1 = sample(sprintf("id%03d",1:K), N, TRUE),
id2 = sample(sprintf("id%03d",1:K), N, TRUE),
id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE),
id4 = sample(K, N, TRUE),
id5 = sample(K, N, TRUE),
id6 = sample(N/K, N, TRUE),
v1 = sample(5, N, TRUE),
v2 = sample(1e6, N, TRUE),
v3 = sample(round(runif(100,max=100),4), N, TRUE)
)
cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
system.time( DF %>% group_by(id1) %>% summarise(sum(v1)) )
system.time( DF %>% group_by(id1) %>% summarise(sum(v1)) )
system.time( DF %>% group_by(id1,id2) %>% summarise(sum(v1)) )
system.time( DF %>% group_by(id1,id2) %>% summarise(sum(v1)) )
system.time( DF %>% group_by(id3) %>% summarise(sum(v1),mean(v3)) )
system.time( DF %>% group_by(id3) %>% summarise(sum(v1),mean(v3)) )
system.time( DF %>% group_by(id4) %>% summarise_each(funs(mean), vars=7:9) )
system.time( DF %>% group_by(id4) %>% summarise_each(funs(mean), vars=7:9) )
system.time( DF %>% group_by(id6) %>% summarise_each(funs(sum), vars=7:9) )
system.time( DF %>% group_by(id6) %>% summarise_each(funs(sum), vars=7:9) )
$ R
# R version 3.1.1 (2014-07-10) -- "Sock it to Me"
# Copyright (C) 2014 The R Foundation for Statistical Computing
# Platform: x86_64-pc-linux-gnu (64-bit)
> sessionInfo()
# R version 3.1.1 (2014-07-10)
# Platform: x86_64-pc-linux-gnu (64-bit)
#
# locale:
# [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
# [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
# [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
# [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
# [9] LC_ADDRESS=C LC_TELEPHONE=C
# [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
# other attached packages:
# [1] dplyr_0.2 data.table_1.9.2
#
# loaded via a namespace (and not attached):
# [1] assertthat_0.1 parallel_3.1.1 plyr_1.8.1 Rcpp_0.11.2 reshape2_1.4
# [6] stringr_0.6.2 tools_3.1.1
$ lsb_release -a
# No LSB modules are available.
# Distributor ID: Ubuntu
# Description: Ubuntu 14.04 LTS
# Release: 14.04
# Codename: trusty
$ uname -a
# Linux ip-172-31-33-222 3.13.0-29-generic #53-Ubuntu SMP
# Wed Jun 4 21:00:20 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux
$ lscpu
# Architecture: x86_64
# CPU op-mode(s): 32-bit, 64-bit
# Byte Order: Little Endian
# CPU(s): 32
# On-line CPU(s) list: 0-31
# Thread(s) per core: 2
# Core(s) per socket: 8
# Socket(s): 2
# NUMA node(s): 2
# Vendor ID: GenuineIntel
# CPU family: 6
# Model: 62
# Stepping: 4
# CPU MHz: 2494.090
# BogoMIPS: 5049.01
# Hypervisor vendor: Xen
# Virtualization type: full
# L1d cache: 32K
# L1i cache: 32K
# L2 cache: 256K
# L3 cache: 25600K
# NUMA node0 CPU(s): 0-7,16-23
# NUMA node1 CPU(s): 8-15,24-31
$ free -h
# total used free shared buffers cached
# Mem: 240G 2.4G 237G 364K 60M 780M
# -/+ buffers/cache: 1.6G 238G
# Swap: 0B 0B 0B