R as a scientific calculator

2 + 3    # addition
## [1] 5
2 ^ 3    # power
## [1] 8
log(2)   # built-in functions
## [1] 0.6931472
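
A few more standard arithmetic operators and built-in functions work the same way, for example:

7 %/% 2  # integer division
## [1] 3
7 %% 2   # modulo (remainder)
## [1] 1
sqrt(16) # square root
## [1] 4
exp(1)   # e^1
## [1] 2.718282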



Variable (object) definition

The traditional R way to assign a value to a variable is to use the symbol <-, but = can be used too:

a <- 3
b = 2 ^ (2 + 2) 

To view the value of a variable, use the print() function:

print(b)
## [1] 16

You can also simply type the name of the R object at the command prompt:

b
## [1] 16

R is case sensitive:

A <- 7
print(A-a) 
## [1] 4

Avoid using the names c, t, cat, F, T, D, and other names of built-in functions and constants.

A variable name can contain letters, digits, underscores, and dots, and must start with a letter or a dot. It cannot contain dollar signs or other special characters, as those are used to build more complex R structures like lists and data frames. Make your variable names descriptive, using a period (.) or underscore (_) to separate words.
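
For example, the following names (chosen here only for illustration) are all valid:

patient.age <- 45       # words separated by a dot
patient_weight <- 70.5  # words separated by an underscore
.hidden <- 1            # valid, but not listed by ls() by default
# 2nd.value <- 3        # invalid: a name cannot start with a digit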

There are numeric, character and logical variable types in R:

element <- "Au"   # character variable (single or double quotes are OK)
number <- 79      # numeric variable
metal <- TRUE     # logical variable
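
The type of any object can be checked with the class() function:

class(element)
## [1] "character"
class(number)
## [1] "numeric"
class(metal)
## [1] "logical"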

Vectors

A vector is an ordered collection of R objects of the same type:

dna.seq <- c("CCTTCCTAC", "GAAATAGT", "ACTGGGGGATACG")  # DNA sequences
len <- c(9, 8, 13)

Vectors can be defined in a variety of ways. Above we used the function c() to combine a few values into a vector. There are many other ways to create a vector in R. Here are a few examples. The outer parentheses around an expression force the result of the operation to be printed:

# define a range of values: 
( ids <- 101:150 )
##  [1] 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
## [18] 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
## [35] 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
# sequence definition
( steps <- seq(from=0, to=5, by=0.1) )
##  [1] 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1.0 1.1 1.2 1.3 1.4 1.5 1.6
## [18] 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3
## [35] 3.4 3.5 3.6 3.7 3.8 3.9 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
# repeat same value
( score <- rep(1, 10) )
##  [1] 1 1 1 1 1 1 1 1 1 1
# 25 normally distributed values
( random <- rnorm(25) )
##  [1]  0.482339036 -0.163408408  1.826881807  1.894999837  0.296070892
##  [6] -0.713986960  2.045204093 -0.681561082  0.154160563  0.473339896
## [11] -0.733300833 -0.351528399  1.209383331  0.650826816 -0.129238927
## [16] -0.004525267  1.010154821 -1.212763502 -1.251903784  0.059685705
## [21]  0.663583716 -0.415480420  1.340479599 -0.516950330 -0.533291861

The last three vectors were defined using the R functions seq(), rep(), and rnorm().

Getting Help in R

To access the help file for an R function with a known name, type a question mark followed by the function name:

?seq
#or
help(seq)

If the function name is not known, use ?? to search the R help articles:

??"standard deviation"
#or
help.search("standard deviation")
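
Two other handy helpers are apropos(), which lists the objects whose names match a pattern, and example(), which runs the examples from a function's help page:

apropos("mean")   # list objects whose names contain "mean"
example(seq)      # run the examples from the seq() help page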

Vector arithmetic:

R vectors can be added, subtracted, multiplied, divided, etc. The operations are performed element by element:

# systolic blood pressure values
SBP <- c(96, 110, 100, 125, 90 )

# diastolic blood pressure
DBP <- c(55, 70, 68, 100, 50)

# calculate MAP (mean arterial pressure)
MAP <- SBP / 3 + 2 * DBP / 3
print(MAP)
## [1]  68.66667  83.33333  78.66667 108.33333  63.33333
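
Note that in the MAP calculation the scalars 2 and 3 are recycled across all elements of the vectors. Arithmetic between two vectors of the same length is also element-wise; for example, pulse pressure (used here only as an illustration) is the difference of the two vectors:

PP <- SBP - DBP   # element-wise subtraction
print(PP)
## [1] 41 40 32 25 40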

Vector Slicing

temp <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5)    # define a numeric vector
temp[2]         # returns second element 
## [1] 38.2
temp[2:4]       # returns second through 4th elements inclusive
## [1] 38.2 36.4 37.9
temp[c(1,3,5)]  # returns 1st, 3rd and 5th elements
## [1] 36.6 36.4 41.0
temp[-2]        # returns all but 2nd element
## [1] 36.6 36.4 37.9 41.0 39.9 36.8 37.5
temp[c(TRUE, FALSE, TRUE, FALSE, FALSE,FALSE, TRUE, FALSE)]   # returns 1st, 3rd, and 7th elements
## [1] 36.6 36.4 36.8
#compare each element of the vector with a value
temp < 37.0
## [1]  TRUE FALSE  TRUE FALSE FALSE FALSE  TRUE FALSE
#return only those elements of the vector that satisfy a specific condition
temp[ temp < 37.0 ]    
## [1] 36.6 36.4 36.8

Basic functions to analyze a vector

which.max(temp)  # find the (first) maximum element and return its index
## [1] 5
which.min(temp)  # same for the (first) minimum element
## [1] 3
which(temp >= 37.0) # find the location of all the elements that satisfy a specific condition
## [1] 2 4 5 6 8

There are a number of other useful functions (a short demonstration on the temp vector follows the list):

max(x), min(x), sum(x)
mean(x), sd(x), median(x), range(x)
var(x) - sample variance
cor(x,y) - correlation between 2 vectors
sort(x), rank(x), order(x)
cumsum(x), cumprod(x), cummin(x), cummax(x)
duplicated(x), unique(x)
summary(x)
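
For example, applying a few of these functions to the temp vector defined above:

max(temp)      # largest value
## [1] 41
min(temp)      # smallest value
## [1] 36.4
mean(temp)     # average
## [1] 38.0375
sort(temp)     # values in increasing order
## [1] 36.4 36.6 36.8 37.5 37.9 38.2 39.9 41.0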

Reading Data

df <- read.csv( "http://scv.bu.edu/examples/r/tutorials/Salaries.csv")

All the data from the file are stored in the R object df, which is called a data frame.

The following webpage is the official CRAN documentation on reading input data of various formats:
https://cran.r-project.org/doc/manuals/r-release/R-data.html
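
read.csv() is a convenience wrapper around read.table(); a few of its commonly used arguments are sketched below (the file name is just a placeholder):

# hypothetical local file, used only to illustrate the arguments
df2 <- read.csv("my_data.csv",
                header = TRUE,             # first line contains column names
                stringsAsFactors = TRUE,   # store text columns as factors
                na.strings = c("", "NA"))  # values to treat as missing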

Explore the data

#numeric data exploratory analysis
min(df$salary)
## [1] 57800
max(df$salary)
## [1] 186960
range(df$yrs.service)
## [1]  0 51
summary(df$salary)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   57800   88612  104671  108024  126775  186960
#view the data
hist(df$salary)

#another way to look at the data
boxplot( df$salary )

#view qq-plot to see if the data are normally distributed
qqnorm(df$salary); qqline(df$salary)

#Shapiro-Wilk test of normality
shapiro.test(df$salary)
## 
##  Shapiro-Wilk normality test
## 
## data:  df$salary
## W = 0.96809, p-value = 0.04743
#Null-hypothesis - the sample comes from a normal distribution
#Alternative hypothesis - the sample does not come from a normal distribution
#p-value < 0.05 - reject the Null-hypothesis - data are not normally distributed


#categorical data analysis
summary(df$rank)
## AssocProf  AsstProf      Prof 
##        13        19        46
summary(df$sex)
## Female   Male 
##     39     39
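
To see how two categorical variables relate to each other, a contingency table can be built with the table() function:

table(df$rank, df$sex)   # counts for each rank/sex combination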

Dataframe slicing

# dataframe slicing (accessing elements)
df[3,5]    # element from the 3rd row, 5th column
## [1] Male
## Levels: Female Male
df[3,]     # all elements from the 3rd row
##   rank discipline yrs.since.phd yrs.service  sex salary
## 3 Prof          A            23          20 Male 110515
df[ ,5]    # all elements in the 5th column 
##  [1] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [11] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [21] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [31] Male   Male   Male   Male   Male   Male   Male   Male   Male   Female
## [41] Female Female Female Female Female Female Female Female Female Female
## [51] Female Female Female Female Female Female Female Female Female Female
## [61] Female Female Female Female Female Female Female Female Female Female
## [71] Female Female Female Female Female Female Female Female
## Levels: Female Male
df$sex  # accessing elements using the column (variable) name
##  [1] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [11] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [21] Male   Male   Male   Male   Male   Male   Male   Male   Male   Male  
## [31] Male   Male   Male   Male   Male   Male   Male   Male   Male   Female
## [41] Female Female Female Female Female Female Female Female Female Female
## [51] Female Female Female Female Female Female Female Female Female Female
## [61] Female Female Female Female Female Female Female Female Female Female
## [71] Female Female Female Female Female Female Female Female
## Levels: Female Male
df[df$sex == "Male", ]  # list all the rows in the dataset for which the variable sex is equal to "Male"
##         rank discipline yrs.since.phd yrs.service  sex salary
## 1       Prof          B            56          49 Male 186960
## 2       Prof          A            12           6 Male  93000
## 3       Prof          A            23          20 Male 110515
## 4       Prof          A            40          31 Male 131205
## 5       Prof          B            20          18 Male 104800
## 6       Prof          A            20          20 Male 122400
## 7  AssocProf          A            20          17 Male  81285
## 8       Prof          A            18          18 Male 126300
## 9       Prof          A            29          19 Male  94350
## 10      Prof          A            51          51 Male  57800
## 11      Prof          B            39          33 Male 128250
## 12      Prof          B            23          23 Male 134778
## 13  AsstProf          B             1           0 Male  88000
## 14      Prof          B            35          33 Male 162200
## 15      Prof          B            25          19 Male 153750
## 16      Prof          B            17           3 Male 150480
## 17  AsstProf          B             8           3 Male  75044
## 18  AsstProf          B             4           0 Male  92000
## 19      Prof          A            19           7 Male 107300
## 20      Prof          A            29          27 Male 150500
## 21  AsstProf          B             4           4 Male  92000
## 22      Prof          A            33          30 Male 103106
## 23  AsstProf          A             4           2 Male  73000
## 24  AsstProf          A             2           0 Male  85000
## 25      Prof          A            30          23 Male  91100
## 26      Prof          B            35          31 Male  99418
## 27      Prof          A            38          19 Male 148750
## 28      Prof          A            45          43 Male 155865
## 29  AsstProf          B             7           2 Male  91300
## 30      Prof          B            21          20 Male 123683
## 31 AssocProf          B             9           7 Male 107008
## 32      Prof          B            22          21 Male 155750
## 33      Prof          A            27          19 Male 103275
## 34      Prof          B            18          18 Male 120000
## 35 AssocProf          B            12           8 Male 119800
## 36      Prof          B            28          23 Male 126933
## 37      Prof          B            45          45 Male 146856
## 38      Prof          A            20           8 Male 102000
## 39  AsstProf          B             4           3 Male  91000
#Create a new dataframe as a subset of the original one
disc.B <- df[df$discipline == "B", ] 
head(disc.B)
##        rank discipline yrs.since.phd yrs.service  sex salary
## 1      Prof          B            56          49 Male 186960
## 5      Prof          B            20          18 Male 104800
## 11     Prof          B            39          33 Male 128250
## 12     Prof          B            23          23 Male 134778
## 13 AsstProf          B             1           0 Male  88000
## 14     Prof          B            35          33 Male 162200
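
The same subset can be obtained with the subset() function, which many find more readable; here it gives the same result as the bracket notation above:

disc.B <- subset(df, discipline == "B")   # equivalent to df[df$discipline == "B", ]
head(disc.B)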

Data analysis

We would like to compare groups within the data frame. Let's compare the salaries of Assistant Professors, Associate Professors, and Professors:

boxplot(salary ~ rank, data=df)

#get mean value for each subgroup
mean( df$salary[ df$rank == "AsstProf" ] )
## [1] 81362.79
mean( df$salary[ df$rank == "AssocProf" ] )
## [1] 91786.23
mean( df$salary[ df$rank == "Prof" ] )
## [1] 123624.8
#There is an easier way to perform the above calculation for each value of a categorical variable:
tapply( df$salary,  df$rank, mean)
## AssocProf  AsstProf      Prof 
##  91786.23  81362.79 123624.80
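
The aggregate() function performs the same group-wise calculation but returns the result as a data frame, which is convenient for further processing:

aggregate(salary ~ rank, data = df, FUN = mean)   # mean salary for each rank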

Is a difference between groups actually significant? For example, can we state that women's salaries are lower than men's? To answer this we will perform an analysis of variance (ANOVA test):

aov.res <- aov( salary ~ sex, data=df )
summary(aov.res)
##             Df    Sum Sq   Mean Sq F value Pr(>F)  
## sex          1 3.845e+09 3.845e+09   5.057 0.0274 *
## Residuals   76 5.780e+10 7.605e+08                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Since the p-value is < 0.05, we reject the null hypothesis of equal means. F is the ratio of two mean square values. If the null hypothesis is true, we expect F to be close to 1. A large F ratio means that the variation among group means is larger than we would expect to see by chance.



The difference in salary for various disciplines in this dataset is even more pronounced:

boxplot( salary ~ sex + discipline, data=df  )

aov.res <- aov( salary ~ sex + discipline, data=df )
summary(aov.res)
##             Df    Sum Sq   Mean Sq F value  Pr(>F)   
## sex          1 3.845e+09 3.845e+09   5.598 0.02056 * 
## discipline   1 6.281e+09 6.281e+09   9.145 0.00341 **
## Residuals   75 5.151e+10 6.869e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Multiple Comparisons

boxplot( salary ~ rank, data=df  )

plot.design( salary ~ rank, data=df )

aov.res <- aov( salary ~ rank, data=df )
summary(aov.res)
##             Df    Sum Sq   Mean Sq F value   Pr(>F)    
## rank         2 2.813e+10 1.406e+10   31.48 1.19e-10 ***
## Residuals   75 3.351e+10 4.468e+08                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here we definitely reject the null hypothesis, but we cannot yet state that all groups have different means.

To compare the means of each pair of subgroups, perform Tukey's honestly significant difference (HSD) test:

TukeyHSD(aov.res)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = salary ~ rank, data = df)
## 
## $rank
##                         diff       lwr       upr     p adj
## AsstProf-AssocProf -10423.44 -28616.09  7769.212 0.3617325
## Prof-AssocProf      31838.57  15962.43 47714.715 0.0000238
## Prof-AsstProf       42262.01  28478.17 56045.864 0.0000000

There is a significant difference between Professors and the other two groups, but we cannot reject the null hypothesis when comparing the mean salaries of Assistant Professors and Associate Professors.

Analysis of 2 numeric variables

plot(df$yrs.service, df$yrs.since.phd)

# Linear regression: fit linear model
lm.fit <- lm(yrs.since.phd ~ yrs.service, data = df)
summary(lm.fit)   
## 
## Call:
## lm(formula = yrs.since.phd ~ yrs.service, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.520 -3.293 -1.361  2.116 15.980 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.33774    0.85337   6.255 2.14e-08 ***
## yrs.service  0.95456    0.04424  21.575  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.713 on 76 degrees of freedom
## Multiple R-squared:  0.8596, Adjusted R-squared:  0.8578 
## F-statistic: 465.5 on 1 and 76 DF,  p-value: < 2.2e-16
coef(lm.fit)   # access coefficients
## (Intercept) yrs.service 
##   5.3377387   0.9545625
resid(lm.fit)  # access residual errors
##           1           2           3           4           5           6 
##  3.88869858  0.93488625 -1.42898880  5.07082365 -2.51986379 -4.42898880 
##           7           8           9          10          11          12 
## -1.56530129 -4.51986379  5.52557370 -3.02042642  2.16169865 -4.29267631 
##          13          14          15          16          17          18 
## -4.33773872 -1.83830135  1.52557370  8.79857377 -0.20142623 -1.33773872 
##          19          20          21          22          23          24 
##  6.98032375 -2.11092633 -5.15598874 -0.97461384 -3.24686373 -3.33773872 
##          25          26          27          28          29          30 
##  2.70732369  0.07082365 14.52557370 -1.38392639 -0.24686373 -3.42898880 
##          31          32          33          34          35          36 
## -3.01967625 -3.38355131  3.52557370 -4.51986379 -0.97423875  0.70732369 
##          37          38          39          40          41          42 
## -3.29305140  7.02576125 -4.20142623 -4.51986379 -0.70198886  0.02576125 
##          43          44          45          46          47          48 
## -3.24686373 -0.33773872 -0.47442630 -4.20180132  2.79857377 -4.83792627 
##          49          50          51          52          53          54 
## -4.56530129 -5.51986379 -0.11055124  1.29838622  6.66226128 -3.20142623 
##          55          56          57          58          59          60 
## -1.33811381 -3.33773872 -2.97423875 -3.29230123  5.84363617 -2.88336376 
##          61          62          63          64          65          66 
## -5.20142623 -1.88336376  1.98032375 -2.11092633 -2.24723882 12.52557370 
##          67          68          69          70          71          72 
## -4.06511375  1.16207373 -3.24686373 15.98032375 -0.20142623 -1.92880126 
##          73          74          75          76          77          78 
##  4.34382372  3.11663624  7.93488625 -4.56530129  9.29838622  3.34382372
fitted(lm.fit) # fitted (predicted) values of yrs.since.phd
##         1         2         3         4         5         6         7 
## 52.111301 11.065114 24.428989 34.929176 22.519864 24.428989 21.565301 
##         8         9        10        11        12        13        14 
## 22.519864 23.474426 54.020426 36.838301 27.292676  5.337739 36.838301 
##        15        16        17        18        19        20        21 
## 23.474426  8.201426  8.201426  5.337739 12.019676 31.110926  9.155989 
##        22        23        24        25        26        27        28 
## 33.974614  7.246864  5.337739 27.292676 34.929176 23.474426 46.383926 
##        29        30        31        32        33        34        35 
##  7.246864 24.428989 12.019676 25.383551 23.474426 22.519864 12.974239 
##        36        37        38        39        40        41        42 
## 27.292676 48.293051 12.974239  8.201426 22.519864 39.701989 12.974239 
##        43        44        45        46        47        48        49 
##  7.246864  5.337739 23.474426 29.201801  8.201426 15.837926 21.565301 
##        50        51        52        53        54        55        56 
## 22.519864 10.110551 18.701614  5.337739  8.201426 26.338114  5.337739 
##        57        58        59        60        61        62        63 
## 12.974239  6.292301 30.156364 14.883364  8.201426 14.883364 12.019676 
##        64        65        66        67        68        69        70 
## 31.110926 28.247239 23.474426 11.065114 15.837926  7.246864 12.019676 
##        71        72        73        74        75        76        77 
##  8.201426 13.928801 19.656176 14.883364 11.065114 21.565301 18.701614 
##        78 
## 19.656176
abline(lm.fit)   # add the regression line to the scatterplot

#predict yrs.since.phd for new values of yrs.service
predict(lm.fit, data.frame(yrs.service=10:20))
##        1        2        3        4        5        6        7        8 
## 14.88336 15.83793 16.79249 17.74705 18.70161 19.65618 20.61074 21.56530 
##        9       10       11 
## 22.51986 23.47443 24.42899
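
predict() can also return confidence or prediction intervals for the new values (standard arguments of predict.lm()):

# 95% confidence interval for the mean response
predict(lm.fit, data.frame(yrs.service=10:20), interval = "confidence")
# 95% prediction interval for individual new observations
predict(lm.fit, data.frame(yrs.service=10:20), interval = "prediction")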

Missing values in R

x <- c(734, 145, NA, 456, NA)    # define a numeric vector
is.na(x)              # check if the element in the vector is missing
## [1] FALSE FALSE  TRUE FALSE  TRUE
which(is.na(x))       # which elements are missing
## [1] 3 5
anyNA(x)              # are there any missing data
## [1] TRUE

Note that x == NA does not work! A missing value cannot be compared to anything (the comparison itself returns NA).

# By default many statistical functions will not compute if the data contain missing values
mean(x)
## [1] NA
# Read help topic for the function
#?mean

#Perform computation removing the missing data
mean(x, na.rm=TRUE)
## [1] 445
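
Missing values can also be removed from the vector itself with na.omit(), or located with complete.cases():

na.omit(x)           # the vector with all NA values removed
x[!is.na(x)]         # the same result using logical indexing
complete.cases(x)    # TRUE for the elements that are not missing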