Skip to content

Benchmarks : Grouping

Matt Dowle edited this page Sep 3, 2014 · 31 revisions
  • The input data is randomly ordered. No pre-sort. No indexes. No key.
  • 5 simple queries are run. Similar to what a data analyst might do in practice; i.e., various ad hoc aggregations as the data is explored and investigated.
  • Each package is tested separately in its own fresh R session.
  • Each query is repeated once more, immediately. This is to isolate cache effects and confirm the first timing. The first and second times are plotted. The total runtime is also displayed.
  • The results are compared and checked allowing for numeric tolerance and column name differences.

Scroll to the bottom to find reproducible code and system info.


Smaller tests : 1E7 1E8 1E9

Code to reproduce the timings above :

## data.table run
$ R --vanilla
require(data.table)
N=2e9; K=100
set.seed(1)
DT <- data.table(
  id1 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
  id2 = sample(sprintf("id%03d",1:K), N, TRUE),      # large groups (char)
  id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE), # small groups (char)
  id4 = sample(K, N, TRUE),                          # large groups (int)
  id5 = sample(K, N, TRUE),                          # large groups (int)
  id6 = sample(N/K, N, TRUE),                        # small groups (int)
  v1 =  sample(5, N, TRUE),                          # int in range [1,5]
  v2 =  sample(1e6, N, TRUE),                        # int in range [1,1e6]
  v3 =  sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
)
cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
system.time( DT[, sum(v1), keyby=id1] )
system.time( DT[, sum(v1), keyby=id1] )
system.time( DT[, sum(v1), keyby="id1,id2"] )
system.time( DT[, sum(v1), keyby="id1,id2"] )
system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
system.time( DT[, list(sum(v1),mean(v3)), keyby=id3] )
system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
system.time( DT[, lapply(.SD, mean), keyby=id4, .SDcols=7:9] )
system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )
system.time( DT[, lapply(.SD, sum), keyby=id6, .SDcols=7:9] )

## dplyr run
$ R --vanilla
require(dplyr)
N=2e9; K=100
set.seed(1)
DF <- data.frame(stringsAsFactors=FALSE,
  id1 = sample(sprintf("id%03d",1:K), N, TRUE),
  id2 = sample(sprintf("id%03d",1:K), N, TRUE),
  id3 = sample(sprintf("id%010d",1:(N/K)), N, TRUE),
  id4 = sample(K, N, TRUE),                          
  id5 = sample(K, N, TRUE),                         
  id6 = sample(N/K, N, TRUE),                       
  v1 =  sample(5, N, TRUE),                         
  v2 =  sample(1e6, N, TRUE),                       
  v3 =  sample(round(runif(100,max=100),4), N, TRUE)
)
cat("GB =", round(sum(gc()[,2])/1024, 3), "\n")
system.time( DF %>% group_by(id1) %>% summarise(sum(v1)) )
system.time( DF %>% group_by(id1) %>% summarise(sum(v1)) )
system.time( DF %>% group_by(id1,id2) %>% summarise(sum(v1)) )
system.time( DF %>% group_by(id1,id2) %>% summarise(sum(v1)) )
system.time( DF %>% group_by(id3) %>% summarise(sum(v1),mean(v3)) )
system.time( DF %>% group_by(id3) %>% summarise(sum(v1),mean(v3)) )
system.time( DF %>% group_by(id4) %>% summarise_each(funs(mean), vars=7:9) )
system.time( DF %>% group_by(id4) %>% summarise_each(funs(mean), vars=7:9) )
system.time( DF %>% group_by(id6) %>% summarise_each(funs(sum), vars=7:9) )
system.time( DF %>% group_by(id6) %>% summarise_each(funs(sum), vars=7:9) )

R info

$ R --vanilla
# R version 3.1.1 (2014-07-10) -- "Sock it to Me"
# Copyright (C) 2014 The R Foundation for Statistical Computing
# Platform: x86_64-pc-linux-gnu (64-bit)

> sessionInfo()
# R version 3.1.1 (2014-07-10)
# Platform: x86_64-pc-linux-gnu (64-bit)
# 
# locale:
#  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
#  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8
#  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8
#  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C
#  [9] LC_ADDRESS=C               LC_TELEPHONE=C
# [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C

# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods base

# other attached packages:
# [1] dplyr_0.2        data.table_1.9.2
# 
# loaded via a namespace (and not attached):
# [1] assertthat_0.1 parallel_3.1.1 plyr_1.8.1     Rcpp_0.11.2 reshape2_1.4
# [6] stringr_0.6.2  tools_3.1.1

System info

Amazon EC2 r3.8large

$ lsb_release -a
# No LSB modules are available.
# Distributor ID:    Ubuntu
# Description:    Ubuntu 14.04 LTS
# Release:    14.04
# Codename:    trusty

$ uname -a
# Linux ip-172-31-33-222 3.13.0-29-generic #53-Ubuntu SMP
# Wed Jun 4 21:00:20 UTC 2014 x86_64 x86_64 x86_64 GNU/Linux

$ lscpu
# Architecture:          x86_64
# CPU op-mode(s):        32-bit, 64-bit
# Byte Order:            Little Endian
# CPU(s):                32
# On-line CPU(s) list:   0-31
# Thread(s) per core:    2
# Core(s) per socket:    8
# Socket(s):             2
# NUMA node(s):          2
# Vendor ID:             GenuineIntel
# CPU family:            6
# Model:                 62
# Stepping:              4
# CPU MHz:               2494.090
# BogoMIPS:              5049.01
# Hypervisor vendor:     Xen
# Virtualization type:   full
# L1d cache:             32K
# L1i cache:             32K
# L2 cache:              256K
# L3 cache:              25600K
# NUMA node0 CPU(s):     0-7,16-23
# NUMA node1 CPU(s):     8-15,24-31

$ free -h
#              total       used       free     shared    buffers cached
# Mem:          240G       2.4G       237G       364K        60M 780M
# -/+ buffers/cache:       1.6G       238G
# Swap:           0B         0B         0B

 web analytics