#### Optimising R code
#### Big Data Analysis for Social Scientists - 2015
#### Robert Ackland & Timothy Graham

# Install doParallel only when it is missing; unconditionally calling
# install.packages() on every run is slow and requires network access.
if (!requireNamespace("doParallel", quietly = TRUE)) {
  install.packages("doParallel")
}
library(doParallel)

# Some general tips
# (1) Only read data from disk at the beginning of code (to load)
#     and at the very end (to save results).
# (2) Avoid using 'for' loops where possible. Better to use the 'apply'
#     family of functions. R works best with matrices, vectors, and tables.
# (3) R only uses one core per instance, even if more are available.
#     To use more than one core, the user must explicitly do this with functions.
# Measuring how fast your code runs.
# Wrap all your code this way; [[3]] extracts the elapsed (wall-clock) time.
system.time({
  # creating a vector of 100 million random values
  # from the normal distribution
  # ('x' rather than 'data', which would mask the utils::data function)
  x <- rnorm(10^8)
})[[3]]

# How much memory is being used by our new data object?
# NOTE: arithmetic on an object.size() result keeps the "object_size"
# class, so object.size(x) / (1024 * 1024) prints the scaled number with
# a misleading "bytes" label. format() reports the size correctly in Mb.
format(object.size(x), units = "Mb")

# how do we free up the memory again?
rm(x)
# How do we erase all objects from memory?
# (The original read rm(list-ls()) -- a '-' typo for '=' -- which errors
# at runtime because there is no object 'list' to subtract ls() from.)
rm(list = ls())
# If you are having problems, you can also run
# 'garbage collection' to tell R to return memory
# back to the operating system (shouldn't have to do this though).
# It basically 'frees up' memory for objects you no longer
# have access to in the global environment (because you used rm function)
gc()
## The importance of pre-declaring variables

# Not pre-declaring the length of 'a': each append with c() copies the
# whole vector, making the loop O(n^2) overall.
cat("\nNot pre-declaring:\n")
n <- 50000
a <- NULL
system.time({
  for (i in 1:n) a <- c(a, i^2)
})[[3]]

# Pre-declaring the length of 'a': each iteration writes in place,
# so the loop is O(n) overall.
cat("\nPre-declaring:\n")
n <- 50000
a <- vector(length = n)
system.time({
  for (i in 1:n) a[i] <- i^2
})[[3]]
# Non-optimised: grow 'a' with c() inside the inner loop.
n <- seq(1, 20000, 1000)
# Pre-allocate one timing slot per value of n.
# (The original c(rep(0), length(n)) builds the 2-element vector c(0, 20);
# rep(0, length(n)) is what was intended.)
nonOptimisedRuntimes <- rep(0, length(n))
for (i in seq_along(n)) {
  # Reset 'a' each run so iteration i times exactly n[i] appends,
  # rather than appending onto everything left over from earlier runs.
  a <- NULL
  ptm <- proc.time()
  for (j in 1:n[i]) a <- c(a, j^2)
  nonOptimisedRuntimes[i] <- proc.time()[[3]] - ptm[[3]]
}
# Optimised: write into a pre-allocated vector instead of appending.
n <- seq(1, 20000, 1000)
a <- vector(length = max(n))
# (Same fix as above: rep(0, length(n)), not c(rep(0), length(n)).)
OptimisedRuntimes <- rep(0, length(n))
for (i in seq_along(n)) {
  ptm <- proc.time()
  # The original wrote a[i] <- i^2, which re-assigns the SAME cell n[i]
  # times; a[j] <- j^2 fills element j, mirroring the per-element work of
  # the non-optimised append version so the comparison is fair.
  for (j in 1:n[i]) a[j] <- j^2
  OptimisedRuntimes[i] <- proc.time()[[3]] - ptm[[3]]
}
# Plot elapsed seconds against run index for each approach: the
# pre-allocated version should stay near zero while the c()-append
# version grows with n (roughly quadratically in total work).
plot(OptimisedRuntimes)
plot(nonOptimisedRuntimes)
# Mistake: performing the same calculation over and over in a loop.
n <- 10000000
system.time({
  a <- vector(length = n)
  # 2 * pi is loop-invariant but gets recomputed/re-multiplied n times.
  for (i in 1:n) a[i] <- 2 * pi * sin(i)
})

# Better: hoist the invariant multiplication out of the loop and apply
# it once to the whole vector.
a <- vector(length = n)
system.time({
  for (i in 1:n) a[i] <- sin(i)
  a <- 2 * pi * a
})

# Even better: avoid the loop altogether -- sin() is vectorised.
system.time({
  a <- 2 * pi * sin(1:n)
})
### The *apply family of functions, or,
### how I learned to avoid loops and love apply functions

## The apply function
# Definition: "Returns a vector or array or list of values obtained by
# applying a function to margins of an array or matrix"

# A 10-row by 2-column matrix: column 1 holds 1..10, column 2 holds 11..20.
m <- matrix(1:20, nrow = 10, ncol = 2)
# MARGIN = 1: apply mean() across each row.
apply(m, MARGIN = 1, FUN = mean)
# MARGIN = 2: apply mean() down each column.
apply(m, MARGIN = 2, FUN = mean)
# MARGIN = 1:2: apply the function to every individual cell (halve each).
apply(m, 1:2, function(v) v / 2)
## The lapply function
# definition: lapply returns a list of the same length as X, each element of
# which is the result of applying FUN to the corresponding element of X.

# A two-element named list of integer vectors.
l <- list(a = 1:10, b = 11:20)
# Per-element mean: the result is a list of the same length as l.
lapply(X = l, FUN = mean)
# Per-element sum, likewise returned as a list.
lapply(X = l, FUN = sum)
## The sapply function
# Definition: sapply is a user-friendly version of lapply
# by default returning a vector or matrix if appropriate

# Same two-element list as before.
l <- list(a = 1:10, b = 11:20)
# Unlike lapply, sapply simplifies the per-element means down to a
# named numeric vector.
l.mean <- sapply(X = l, FUN = mean)
# Confirm the simplified return type.
class(l.mean)
###### Parallel computing with R
# parallel computing with "doParallel" package
# based on this: http://cran.r-project.org/web/packages/doParallel/vignettes/gettingstartedParallel.pdf
library(doParallel)
cl <- makeCluster(2) # 2 nodes
registerDoParallel(cl) # register cluster
# One good example is bootstrapping. Let's see how long it takes to run
# 10,000 bootstrap iterations in parallel on 2 cores.
# Drop the first (Sepal.Length) and fifth (Species) iris columns for all
# non-setosa rows, giving a two-class logistic-regression problem.
# NOTE(review): x[,2] keeps all three Species factor levels even though
# the "setosa" rows were removed; glm() treats the first level as
# "failure", so consider droplevels() if the fitted model (not just the
# timing) matters -- confirm against the doParallel vignette.
x <- iris[which(iris[,5] != "setosa"), c(1,5)]
trials <- 10000
# system.time(...)[3] extracts elapsed (wall-clock) seconds.
ptime <- system.time({
# icount(trials) iterates 'trials' times; .combine=cbind binds each
# iteration's coefficient vector into a matrix of bootstrap estimates.
r <- foreach(icount(trials), .combine=cbind) %dopar% {
# resample 100 row indices with replacement, then refit the model
ind <- sample(100, 100, replace=TRUE)
result1 <- glm(x[ind,2]~x[ind,1], family=binomial(logit))
coefficients(result1)
}
})[3]
ptime
# By changing the %dopar% to %do%, we can run the same code sequentially to
# determine the performance improvement.
x <- iris[which(iris[,5] != "setosa"), c(1,5)]
# Use the SAME number of trials as the parallel run: the original used
# 2,500 here, which made the two elapsed times incomparable with the
# 10,000-trial parallel benchmark above.
trials <- 10000
# Store the sequential time in its own variable ('stime') instead of
# overwriting 'ptime', so both timings remain available for comparison.
stime <- system.time({
  r <- foreach(icount(trials), .combine = cbind) %do% {
    ind <- sample(100, 100, replace = TRUE)
    result1 <- glm(x[ind, 2] ~ x[ind, 1], family = binomial(logit))
    coefficients(result1)
  }
})[3]
stime
# stop the cluster (IMPORTANT!)
stopCluster(cl)