library(ggplot2)

mv  <- as.numeric(-5000:5000)                    # vector of means
sds <- rep(0.1, 10001)                           # vector of sd
vals <- vector(mode = "numeric", length = 10001)
for (i in 1:10001) { vals[i] <- rnorm(1, mv[i], sds[i]) }

system.time({
  mv  <- as.numeric(-5000:5000)                  # vector of means
  sds <- rep(0.1, 10001)                         # vector of sd
  vals <- vector(mode = "numeric", length = 10001)
  for (i in 1:10001) { vals[i] <- rnorm(1, mv[i], sds[i]) }
})

# ----- Vectorize -----------------
slow <- function(){
  mv  <- as.numeric(-5000:5000)                  # vector of means
  sds <- rep(0.1, 10001)                         # vector of sd
  vals <- vector(mode = "numeric", length = 10001)
  for (i in 1:10001) { vals[i] <- rnorm(1, mv[i], sds[i]) }
}

fast <- function(){
  mv  <- as.numeric(-5000:5000)                  # vector of means
  sds <- rep(0.1, 10001)                         # vector of sd
  vals <- rnorm(10001, mv, sds)
}

system.time( for (i in 1:10) slow() )
system.time( for (i in 1:10) fast() )

# ----- Use the microbenchmark package -----
library(microbenchmark)
microbenchmark(slow(), fast())
res <- microbenchmark(slow(), fast(), times = 10)
print(res)
boxplot(res)

# ----- Order matters! -------
# matrix A * matrix B * vector v
A <- matrix(rnorm(10000), nrow = 100)
B <- matrix(rnorm(10000), nrow = 100)
v <- rnorm(100)
microbenchmark(
  A %*% B %*% v,
  A %*% (B %*% v)
)

# ----- A few other optimized functions for matrices -----
A <- matrix(rnorm(10000), nrow = 100)
B <- matrix(rnorm(10000), nrow = 100)
v <- rnorm(100)
w <- rnorm(100)
microbenchmark( t(A) %*% A, crossprod(A) )
microbenchmark( t(A) %*% v, crossprod(A, v) )
microbenchmark( v %*% t(w), tcrossprod(v, w) )

# ----- Use colMeans/rowMeans/colSums/rowSums instead of apply() -------
# Define a matrix
matr <- matrix(runif(10000), nrow = 100)

# Compare apply(, 1, FUN = mean) with rowMeans()
microbenchmark( apply(matr, 1, mean), rowMeans(matr) )

# Compare apply(, 2, FUN = mean) with colMeans()
microbenchmark( apply(matr, 2, mean), colMeans(matr) )

# Compare apply(, 1, FUN = sum) with rowSums()
microbenchmark( apply(matr, 1, sum), matr %*% rep(1, 100), rowSums(matr) )

# Compare apply(, 2, FUN = mean) with colMeans() on a much larger matrix
matr <- matrix(runif(10000000), nrow = 100)
microbenchmark( apply(matr, 2, mean), colMeans(matr), times = 2 )

# The same comparison with a data frame
data <- as.data.frame(matr)
microbenchmark(
  apply(data, 2, mean),
  lapply(data, mean),
  vapply(data, mean, numeric(1)),
  colMeans(data),
  times = 2
)

library(profvis)
profvis({
  apply(data, 2, mean)
  lapply(data, mean)
  vapply(data, mean, numeric(1))
  colMeans(data)
})
profvis({
  apply(data, 2, sum)
  lapply(data, sum)
  vapply(data, sum, numeric(1))
  colSums(data)
})

# --------- sort() function --------
x <- runif(10^6)
# If the goal is to find only a few smallest elements, a partial sort is faster
microbenchmark(
  (first10 <- sort(x)[1:10]),
  (first10 <- sort(x, partial = 1:10)[1:10])
)

# Finding indices of elements equal to the minimum (or maximum) value
# (note: which.min() returns only the first such index)
x <- rep(1:9, each = 10000)
microbenchmark(
  (min.loc1 <- which(x == min(x))),
  (min.loc2 <- which.min(x))
)

# --------- Use ifelse() instead of if() when working with vectors --------
vec <- rep(FALSE, 100000)
x <- rnorm(100000, 0, 1)
system.time( for (i in 1:length(vec)) if (x[i] < 0) vec[i] <- TRUE )
system.time( vec <- ifelse(x < 0, TRUE, FALSE) )

# ----- Calculating the max/min value for each row of a matrix -----
# Especially effective when the number of rows is large and the number of columns is low
x <- matrix(rnorm(1e7, 0, 1), ncol = 10, nrow = 1e6)
system.time( apply(x, 1, max) )
system.time( do.call(pmax, data.frame(x)) )
microbenchmark( apply(x, 1, max), do.call(pmax, data.frame(x)), times = 3 )

# ----- ifelse, pmax, pmin and simple assignment -----
# Make all elements in a vector no larger than xmax and no smaller than xmin
trim.ifelse <- function(x, xmin, xmax){
  ifelse(x <= xmin, xmin, ifelse(x >= xmax, xmax, x))
}
trim.pmaxmin <- function(x, xmin, xmax){
  pmax(pmin(x, xmax), xmin)
}
trim.replace <- function(x, xmin, xmax){
  x[x < xmin] <- xmin
  x[x > xmax] <- xmax
  x
}

# Load the Rcpp library and compile the C++ function
library(Rcpp)
sourceCpp('trim.cpp')
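# The file 'trim.cpp' compiled above is not included in this script; its contents are an
# assumption here. A minimal inline equivalent of trim_cpp(), defined with cppFunction(),
# might look like this sketch (not necessarily the original source):
cppFunction('
NumericVector trim_cpp(NumericVector x, double xmin, double xmax) {
  int n = x.size();
  NumericVector out(n);
  for (int i = 0; i < n; i++) {
    if (x[i] < xmin)       out[i] = xmin;   // clamp from below
    else if (x[i] > xmax)  out[i] = xmax;   // clamp from above
    else                   out[i] = x[i];
  }
  return out;
}')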
x <- runif(100, -5, 5)
microbenchmark(
  trim.ifelse(x, -3, 3),
  trim.pmaxmin(x, -3, 3),
  trim.replace(x, -3, 3),
  trim_cpp(x, -3, 3)
)

df <- data.frame(age       = runif(10^3, min = 0,    max = 100),
                 body.temp = runif(10^3, min = 35.5, max = 42),
                 bmi       = runif(10^3, min = 18,   max = 32))

# We want to create another column that, for significantly overweight people, performs some analysis

# Very slow method
method1 <- function(){
  df$res <- rep(NA, nrow(df))
  for (i in 1:nrow(df)){
    if (df$bmi[i] > 28){
      # here we calculate something (here we just do some random calculations)
      if (df$age[i] > 18) df$res[i] <- mean(rnorm(1000, mean = df$age[i]))
      else                df$res[i] <- median(rnorm(1000, mean = df$age[i]))
    } else {
      df$res[i] <- -1
    }
  }
}

# Faster approach: evaluate the condition once, outside the loop, and iterate only over matching rows
method2 <- function(){
  df$res <- rep(NA, nrow(df))
  condition <- df$bmi > 28
  for (i in (1:nrow(df))[condition]) {
    # here we calculate something (here we just do some random calculations)
    if (df$age[i] > 18) df$res[i] <- mean(rnorm(1000, mean = df$age[i]))
    else                df$res[i] <- median(rnorm(1000, mean = df$age[i]))
  }
}

# Compare the methods
microbenchmark(method1(), method2(), times = 10)

# --------------------------------------------
# ----- "Growing" arrays and matrices --------
# --------------------------------------------
# Pre-allocating memory for a vector or matrix/data frame that is filled inside a for-loop
# improves memory usage and significantly improves speed for a large number of iterations:
x <- c()
for (i in 1:7) {
  x <- c(x, i)
  print( paste("Iteration", i, " x: ", paste(x, collapse = ", ")) )
}
#[1] "Iteration 1 x: 1"
#[1] "Iteration 2 x: 1, 2"
#[1] "Iteration 3 x: 1, 2, 3"
#[1] "Iteration 4 x: 1, 2, 3, 4"
#[1] "Iteration 5 x: 1, 2, 3, 4, 5"
#[1] "Iteration 6 x: 1, 2, 3, 4, 5, 6"
#[1] "Iteration 7 x: 1, 2, 3, 4, 5, 6, 7"

x <- numeric(7)
for (i in 1:7) {
  x[i] <- i
  print( paste("Iteration", i, " x: ", paste(x, collapse = ", ")) )
}
#[1] "Iteration 1 x: 1, 0, 0, 0, 0, 0, 0"
#[1] "Iteration 2 x: 1, 2, 0, 0, 0, 0, 0"
#[1] "Iteration 3 x: 1, 2, 3, 0, 0, 0, 0"
#[1] "Iteration 4 x: 1, 2, 3, 4, 0, 0, 0"
#[1] "Iteration 5 x: 1, 2, 3, 4, 5, 0, 0"
#[1] "Iteration 6 x: 1, 2, 3, 4, 5, 6, 0"
#[1] "Iteration 7 x: 1, 2, 3, 4, 5, 6, 7"

x <- c()
system.time( for (i in seq(1e5)) x <- c(x, i) )   # most inefficient way to build the vector
#   user  system elapsed
# 13.075   0.104  13.167

x <- numeric(1e5)
system.time( for (i in seq(1e5)) x[i] <- i )      # much more efficient
#   user  system elapsed
#  0.089   0.003   0.092

system.time( vec <- 1:100000 )                    # most efficient way for this example

# The same applies to matrices
matr <- NULL
system.time( for (i in seq(1e4)) matr <- rbind(matr, 1:10) )
#   user  system elapsed
#  4.447   0.012   4.454

matr <- matrix(NA, nrow = 1e4, ncol = 10)
system.time( for (i in seq(1e4)) matr[i, ] <- 1:10 )
#   user  system elapsed
#  0.013   0.000   0.013

# If the size of the final vector or matrix is unknown, initialize it with some reasonable
# upper-bound size and then remove the extra records at the end (see the sketch below).
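# A minimal sketch of that pattern (the upper bound, the keep-condition and the variable
# names here are made up for illustration): pre-allocate to an upper bound, count the
# filled slots, and trim the unused tail at the end.
upper.bound <- 1e5
res <- numeric(upper.bound)               # pre-allocate to a known upper bound
n.kept <- 0
for (i in seq_len(upper.bound)) {
  val <- rnorm(1)
  if (val > 0) {                          # keep only some of the generated values
    n.kept <- n.kept + 1
    res[n.kept] <- val
  }
}
res <- res[seq_len(n.kept)]               # remove the extra (never filled) records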
# -----------------------------------------------------
# ------- unlist() (keep the names only if needed) -----
# -----------------------------------------------------
# Create a list
my.list <- split(sample(1000, size = 100000, replace = TRUE), rep(1:10000, times = 10))
microbenchmark(
  v <- unlist(my.list),
  v <- unlist(my.list, use.names = FALSE)
)

# ----------- Reading large files -----------------
# # Create a large data frame
# MakeRandomString <- function(n, len = 10){
#   v <- vector(mode = "character", length = n)
#   for (i in 1:n){
#     v[i] <- paste(sample(letters, size = len, replace = TRUE), collapse = "")
#   }
#   return(v)
# }
#
# N <- 1000000
# dt <- data.frame(V1 = MakeRandomString(N, len = 25),
#                  V2 = MakeRandomString(N, len = 15),
#                  V3 = rnorm(N),
#                  V4 = rnorm(N))
#
# # Output into the file
# write.csv(dt, file = "SampleFile.csv", quote = FALSE, row.names = FALSE)

system.time( dt1 <- read.csv("SampleFile.csv") )

library(data.table)
system.time( dt2 <- fread("SampleFile.csv", data.table = FALSE) )

# ------- Extracting a value from a data frame -------
microbenchmark(
  "[50, 3]" = dt1[50, 3],
  "$V3[50]" = dt1$V3[50],
  "subset2" = .subset2(dt1, 3)[50]
)
microbenchmark(
  "[, 3]"   = dt1[, 3],
  "$V3"     = dt1$V3,
  "subset2" = .subset2(dt1, 3)
)

# ------- R code profiling -------
x <- matrix(rnorm(10000000), nrow = 1000)
x[sample(1:10000000, 1000, replace = FALSE)] <- NA

slowFun <- function(x) {
  # initialize res and grow it row by row (slow)
  res <- NULL
  n <- nrow(x)
  for (i in 1:n) {
    if (!any(is.na(x[i, ]))) res <- rbind(res, x[i, ])
  }
  apply(res, 1, mean)
}

Rprof()
x.nomiss <- slowFun(x)
Rprof(NULL)
summaryRprof()

Rprof(tmp <- tempfile())
x.nomiss <- slowFun(x)
Rprof(NULL)
summaryRprof(tmp)

library(proftools)
plotProfileCallGraph( readProfileData(tmp), score = "total" )

fasterFun <- function(x) {
  n <- nrow(x)
  res <- x                         # initialize res with the size of x (pre-allocation)
  k <- 0                           # number of complete rows copied so far
  for (i in 1:n) {
    if (!any(is.na(x[i, ]))) {
      k <- k + 1
      res[k, ] <- x[i, ]
    }
  }
  rowMeans(res[1:k, ])             # keep only the filled rows
}

Rprof(tmp <- tempfile())
x.nomiss <- fasterFun(x)
Rprof(NULL)
summaryRprof(tmp)

library(proftools)
plotProfileCallGraph( readProfileData(tmp), score = "total" )

# TO add:
# x[x > 0] vs x[which(x > 0)] for large vectors (first sketch below)
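# A first sketch for the item above (vector size and 'times' are arbitrary choices):
# logical subsetting scans the full logical mask, while which() converts it to integer
# indices first, which is often faster for large vectors. Note that which() also drops
# NA positions, so the two forms differ when x contains missing values.
x <- rnorm(1e7)
microbenchmark(
  x[x > 0],
  x[which(x > 0)],
  times = 10
)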