library(profvis)
library(microbenchmark)
library(ggplot2)


# There is a built-in function system.time() that allows to measure the time to execute one or more R statements:
system.time ({
  x <- c()
  for (i in  1:10e4) x <- c( x, i )
})


# --------------------------------------------
# ----- "Growing" arrays and matrices -------
# --------------------------------------------


# Pre-allocating memory for the vector or matrix/dataframe that needs to be created within a for-loop
# improves memory usage and significantly improves the speed for large number iterations:

x <- c()
for (i in  1: 7) {
  x <- c( x, i )
  
  print( paste("Iteration", i, " x: ", paste(x, collapse=", ")))
}

#[1] "Iteration 1  x:  1"
#[1] "Iteration 2  x:  1, 2"
#[1] "Iteration 3  x:  1, 2, 3"
#[1] "Iteration 4  x:  1, 2, 3, 4"
#[1] "Iteration 5  x:  1, 2, 3, 4, 5"
#[1] "Iteration 6  x:  1, 2, 3, 4, 5, 6"
#[1] "Iteration 7  x:  1, 2, 3, 4, 5, 6, 7"

x <- numeric(7)
for (i in  1: 7) {
  x[i] <- i
  
  print( paste("Iteration", i, " x: ", paste(x, collapse=", ")))
}

#[1] "Iteration 1  x:  1, 0, 0, 0, 0, 0, 0"
#[1] "Iteration 2  x:  1, 2, 0, 0, 0, 0, 0"
#[1] "Iteration 3  x:  1, 2, 3, 0, 0, 0, 0"
#[1] "Iteration 4  x:  1, 2, 3, 4, 0, 0, 0"
#[1] "Iteration 5  x:  1, 2, 3, 4, 5, 0, 0"
#[1] "Iteration 6  x:  1, 2, 3, 4, 5, 6, 0"
#[1] "Iteration 7  x:  1, 2, 3, 4, 5, 6, 7"


# similar with matrices
matr <-NULL
system.time( for(i in seq(1e4)) matr <-rbind(matr, 1:10) )
#user  system elapsed 
#4.447   0.012   4.454 

matr <- matrix(NA, nrow=1e4, ncol=10)
system.time( for(i in seq(1e4) ) matr[i,] <- 1:10)
#user  system elapsed 
#0.013   0.000   0.013 


# If the size of the final vector or matrix unknown - initialize with some reasonable upperbound value
# and then remove extra records at the end


# --------------------------------------------
# ---------------- Vectorize -----------------
# --------------------------------------------


# In this example we create random numbers one at a time
slow<-function(){
  
  vals<- as.numeric(10e4)
  for(i in 1:10e4) vals[i] <- rnorm(1)
}
  
# Here we generate random numbers using a single function call:  
fast<-function(){
  vals<-rnorm(10e4)
}

system.time( slow() )

system.time( for(i in 1:10) slow() )
system.time( for(i in 1:10) fast() )


# ----- Use microbenchmark package for very short executions-----

microbenchmark(slow())

microbenchmark(slow(), fast() )
res <- microbenchmark(slow(), fast(), times=10 )
print(res)
boxplot(res)


# *** Calls to functions ***
# Loop: one million calls to rnorm()
# Vectorized: a single call to rnorm()

# *** Result assignment ***
# Loop: One million calls to the assignment method
# Vectorized: a single assignment


# Another example of vectorized approach
x <- rnorm( 10e3, 100,2 )

microbenchmark({
  # Slow method:
  total <- 0
  for(i in seq_along(x)) total <- total + log(x[i])
},
{
  # Faster:
  log_sum <- sum(log(x))
})


# Another example of vectorized approach
vec <- rep(FALSE, 10e4)
x <- rnorm(10e4,0,1)

microbenchmark(
  # Slow method: using if() within a loop 
  "if:" = ( for(i in 1:length(vec)) if(x[i] < 0) vec[i] <- TRUE),

  # Faster method: use ifelse() function
  "ifelse:" = ( vec <-ifelse (x<0, TRUE, FALSE ) )
)


# seq_along(x) function is safer and faster than 1:length(x):
x<- rnorm(1e5)
microbenchmark( 1:length(x), seq_along(x) )
#        expr min  lq mean median  uq  max neval
# 1:length(x) 200 300  414    300 300 7600   100
#seq_along(x) 100 100  170    100 200 3700   100


# In the cases when the input x vector has zero length (as a result of a bug or an incorrect input) , seq_along() will doo the right thing:

x <- NULL

# Using 1:length(), the loop will execute two times!
for ( i in 1:length(x) ) print(i)
#> [1] 1
#> [1] 0

# Using seq_along() the loop will execute 0 times
for ( i in seq_along(x) ) print(i)


#---------------------------------------------
#     Random Numbers and vectorization
#---------------------------------------------

# Random number generation is a relatively slow operation. In this example we create random numbers one at a time:

slow<-function(){
  vals<- as.numeric(1e5)
  for(i in 1:1e5) vals[i] <- rnorm(1)
}

# And here we generate random numbers using a single rnorm function call:

fast<-function(){
  vals<-rnorm(1e5)
}

# Let's use microbenchmark() function to compare these two approaches:

microbenchmark(slow(), fast(), times=10 )
#> Unit: milliseconds
#>    expr        min         lq       mean     median         uq       max neval
#>  slow() 204.492434 205.735406 219.102406 207.380764 210.769643 321.73002    10
#>  fast()   6.222917   6.232032   6.387098   6.238346   6.253368   7.72048    10
#>  cld
#>    b
#>   a

# Similar approach should be used when working with other functions, including sample().

#Note: There is an R package - dqrng - that might be considered for random number generation:
microbenchmark(slow(), fast(), "dqrng"=dqrng::dqrnorm(1e5), times=10)
#> Unit: microseconds
#>    expr        min         lq       mean      median         uq        max
#>  slow() 205924.259 207621.974 208790.460 208738.3230 210418.033 211231.077
#>  fast()   6188.793   6238.763   6250.755   6246.2280   6263.155   6315.276
#>   dqrng    804.661    815.949   9780.719    845.5415    855.673  90288.405
#>  neval cld
#>     10   b
#>     10  a 
#>     10  a


# -------------------------------------------------------------
# ---- Large Matrix operations:  Order matters! ---------------
# ------------------------------------------------------------


# matrix A * matrix B * vector v
A <- matrix(rnorm(10000), nrow = 100)
B <- matrix(rnorm(10000), nrow = 100)
v <- rnorm(100)
microbenchmark(
  A %*% B %*% v, 
  A %*% (B %*% v) )

# A few other optimized functions for matrices:
# - crossprod()
# - tcrossprod()


# ----------------------------------------------------------------------
# ----- Use colMeans rowMeans colSums rowSums instead of apply() -------
# ----------------------------------------------------------------------

#Define a matrix
matr <- matrix(runif(1e4), nrow=100) 

# Compare apply(,1,FUN = mean) with rowMeans()
microbenchmark( 
  apply(matr, 1, mean),
  rowMeans(matr) )


# Compare apply(,2,FUN = mean) with colMeans()
microbenchmark( 
  apply(matr, 2, mean),
  colMeans(matr) )

# Compare apply(,1,FUN = sum) with rowSums()
microbenchmark( 
  apply(matr, 1, sum),
  matr %*% (rep(1,100) ), 
  rowSums(matr) )


#With the dataframe
data <- as.data.frame( matr )
microbenchmark( 
  apply(data, 2, mean),
  lapply(data,mean ),
  vapply(data,mean, numeric(1) ),
  colMeans(data),
  times=2 
)


# ----------------------------------------------------------------------
# ------------- Data Frames VS. Matrix ---------------------------------
# ----------------------------------------------------------------------

# If you are working with all-numeric data, matrix is a faster R object to use than a data.frame

mat <- matrix(rnorm(10e5),ncol=10)
df <- as.data.frame(mat)
microbenchmark(mat[,1], df[,1])


# When using a data.frame it is slightly faster to use a column name
microbenchmark(
  "[50, 3]" = df[50,3],
  "$V3[50]" = df$V3[50]
)

# A dataframe in R is also a list - a list of columns. Each column is stored separately in memory. The most efficient way extracting a column from a dataframe is by using the column name:

microbenchmark(mat[ ,7], df[ ,7], df$V7)
#> Unit: nanoseconds
#>       expr  min     lq    mean median     uq   max neval cld
#>  matr[, 7]  942 1296.0 1494.64 1433.5 1575.5  6616   100  a 
#>    df[, 7] 6715 7574.5 8540.65 8047.5 8533.0 43265   100   b
#>      df$V7  747  938.5 1130.54 1080.0 1230.5  5087   100  a


# Which is faster, mat[7, ] or df[7, ]? 
microbenchmark(mat[7,], df[7,])
#> Unit: microseconds
#>       expr     min       lq      mean   median       uq     max neval cld
#>  matr[7, ]   1.253   2.1650   2.64029   2.7290   2.9725   6.076   100  a 
#>    df[7, ] 560.138 574.3515 590.34775 579.5385 595.5235 664.841   100   b

# Also selecting a row in the dataframe returns a dataframe!


# use data.frame to store the data
d <- function() {
  data.frame(
    col1 = sample(1:10, 30, replace = TRUE)
  )
}

# use matrix to store the data
m <- function() {
  matrix(sample(1:10, 30, replace = TRUE), nrow=30)
}

# compare 2 approaches
microbenchmark(
  df = d(),
  mat = m()
)
# Unit: microseconds
# expr   min    lq    mean median     uq    max neval
#   df 148.7 151.0 181.672 152.55 154.55 2549.0   100
#  mat   7.7   9.1  32.219  10.50  12.40 2132.3   100


# ----------------------------------------------------------------------
# ---------------------------- sort() function -------------------------
# ----------------------------------------------------------------------
x <- runif(10^5)
# if the goal is to find only a few smallest elements than it's more optimal to use partial sort
microbenchmark(
  
  (first10 <- sort(x)[1:10]), 
  (first10 <- sort(x,  partial=1:10) ) 
)


# finding indices of elements which are equal to minimum or maximum value
x <- rep(1:9, each =10000)
microbenchmark(
  (min.loc1 <- which (x == min(x) ) ),
  (min.loc2 <- which.min (x))
)

#  Unit: microseconds
#     expr   min     lq    mean median      uq    max neval
#(min.loc1 <- which(x == min(x))) 339.8 440.95 662.826 455.65 1073.50 1370.2   100
#(min.loc2 <- which.min(x)) 114.5 121.60 174.138 122.10  150.75  507.9   100


# ----------------------------------------------------------------------
# ------------------------ pmax,pmin vs. max,min ------------------------
# ----------------------------------------------------------------------


# ----- Calculating max/min value for each row in a matrix -----
# Especially effective when the number of rows is large and number of columns is low ---
x <- matrix(rnorm(1e7,0,1),nrow=1e6, ncol=10)

system.time(apply(x,1,max))
system.time(do.call(pmax,data.frame(x)))

microbenchmark( 
  apply(x,1,max),
  do.call(pmax,data.frame(x)) , 
  times = 3)
  
# However when the matrix is very wide, a regular "max" function should be used

x <- matrix(rnorm(1e7,0,1),ncol=1e6, nrow=10)

system.time(apply(x,1,max))
system.time(do.call(pmax,data.frame(x)))


# ----------------------------------------------------------------------
# --------- Comparing various approaches to solve a problem  -----------
# ----------------------------------------------------------------------


# Problem:
# Make all elements in vector to me no larger than xmax and no smaller than xmin
trim.ifelse <- function(x, xmin, xmax){
  
  ifelse( x<=xmin, xmin, ifelse(x>=xmax, xmax, x))
}

trim.pmaxmin <- function(x, xmin, xmax){
  
  pmax(pmin(x,xmax), xmin)
}

trim.replace <- function(x, xmin, xmax){
  
  x [x < xmin ] <- xmin
  x [x > xmax ] <- xmax
}

#load Rcpp library
library(Rcpp)
#compile C function
sourceCpp('trim.cpp')

x <- runif(100, -5, 5)
microbenchmark(
  trim.ifelse(x, -3, 3),
  trim.pmaxmin(x, -3, 3), 
  trim.replace(x, -3, 3),
  trim_cpp(x, -3, 3) )


# ----------------------------------------------------------------------
# ------------------- for() loop has [condition] -----------------------
# ----------------------------------------------------------------------
df <- data.frame(age = runif(10^3, min=0, max=100),
                 body.temp = runif(10^3, min=35.5, max = 42),
                 bmi = runif(10^3, min=18, max=32)
                 )

#We want to create another column, that for significantly overweight people performs some analysis


#very slow method
method1 <- function(){
  df$res <- rep(NA,nrow(df))
  for (i in 1:nrow(df)){
    
    if (df$bmi[i] > 28){
      
      #here we calculate something (here we just do some random calculations)
      if (df$age[i] > 18)df$res[i] = mean(rnorm(1000,mean=df$age[i])) else median(rnorm(1000,mean=df$age[i]))
      
    }else {
      df$res = -1
    }
    
  }
}

#faster approach - to include the condition in the loop:
method2 <- function(){
  
  #faster method
  df$res <- rep(NA,nrow(df))
  condition <- df$bmi > 28
  for (i in (1:nrow(df)) [condition ]) {
    
    #here we calculate something (here we just do some random calculations)
    if (df$age[i] > 18)df$res[i] = mean(rnorm(1000,mean=df$age[i])) else median(rnorm(1000,mean=df$age[i]))  
    
  }
  
}
#compare methods
microbenchmark(method1(), method2(), rep=10)


# -----------------------------------------------------
# ------- Unlist (keep the names only if needed)-------
# -----------------------------------------------------

#Create some list
my.list <- split(sample(1000, size = 100000, rep=TRUE), rep(1:10000, times=10))
microbenchmark(
  v <- unlist(my.list), 
  v <- unlist(my.list, use.names = FALSE) )


# -----------------------------------------------------
# ---------------- R code profiling -------------------
# -----------------------------------------------------

x <- matrix(rnorm(10000000), nrow=1000)
x[sample(1:10000000, 1000, replace=FALSE)] <- NA
slowFun = function(x) {
  # initialize res 
  res = NULL
  n = nrow(x)
  
  for (i in 1:n) {
    if (!any(is.na(x[i,]))) res = rbind(res, x[i,])
  }
  apply(res,1,mean)
}

# Start profiler
Rprof ()

  # run code
   x.nomiss <- slowFun(x)
   
# end profiler
Rprof(NULL)


# view results
summaryRprof()


# use profvis() from profvis package to visually see the bottleneck
profvis({  
  res = NULL
  n = nrow(x)
  
  for (i in 1:n) {
    if (!any(is.na(x[i,]))) res = rbind(res, x[i,])
  }
  apply(res,1,mean)
})