# ---------------------------------------#
#                                        #
#  Introduction to R                     #
#  Research Computing Services           #
#                                        #
#  Katia Bulekova                        #
#                                        #
# ---------------------------------------#

# Use the R console as a scientific calculator
2 + 3   # sum of two numbers
2^3     # 2 raised to the 3rd power
log(2)  # calling a built-in function (natural logarithm)
pi      # a built-in constant


#------------------ #
#     variables     #
#------------------ #

# R is case sensitive: a and A are two different variables
a <- 3
A <- 7

# Naming rules and recommendations:
#  - avoid the names F and T: they are built-in constants for FALSE and TRUE
#  - c() is an R function, so to avoid confusion do not name your own
#    variables and functions "c"
#  - a variable name can contain letters, digits, underscores and dots
#  - a variable name must start with a letter or a dot
#  - a variable name cannot contain a dollar sign

name   <- "Nikola Tesla"  # character
weight <- 150.6           # numeric
age    <- 87L             # integer (note the L suffix)
male   <- TRUE            # logical (Boolean)
today  <- Sys.time()      # date and time

# Find out the class of an object:
class(age)


# There are 4 basic data types: vectors, matrices, dataframes, lists

#------------------ #
#   R vectors       #
#------------------ #

# A vector is an array of R objects that all have the same type:
names <- c("Alex", "Nick", "Mike")
print(names)  # display the vector explicitly
names         # typing a variable name at the prompt also shows its content


# There are several ways to build a vector:
c(2, -7, 5, 3, -1)               # concatenate individual values
25:75                            # range of consecutive values
seq(from = 0, to = 3, by = 0.5)  # sequence with a given step
rnorm(50)                        # 50 normally distributed random values

# Blood pressure readings: systolic (SBP) and diastolic (DBP)
SBP <- c(96, 110, 100, 125, 90)
DBP <- c(55, 70, 68, 100, 50)

# Mean arterial pressure (MAP), computed element by element:
# one third of the systolic value plus two thirds of the diastolic value
MAP <- SBP / 3 + 2 * DBP / 3
MAP

# R arithmetic operators
#              +, -, *, /  - addition, subtraction, multiplication, division
#              ^ or **     - exponentiation
#              %%          - modulus
#              %/%         - integer division

# R logical (Boolean) operators
#              %in%        - membership
#              <, <=, ==, >=, >, !=     - Boolean comparative
#       

# Examples:
MAP <= 80          # compare every element of MAP with 80

"Mike" %in% names  # is the string "Mike" one of the elements of names?

#-------------------------------------- #
#   R vector slicing (sub-setting)      #
#-------------------------------------- #
 
temp <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5)
temp[2]           # the 2nd element
temp[2:4]         # elements 2 through 4, both ends included
temp[c(1, 3, 5)]  # the 1st, 3rd and 5th elements
temp[-2]          # everything except the 2nd element
# a logical index keeps the positions marked TRUE (here the 1st, 3rd and 7th)
temp[c(TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE)]

# element-wise comparison of the vector with a single value
temp < 37.0

# keep only the elements that satisfy the condition
temp[temp < 37.0]

#-------------------------------------- #
#             Vector operations         #
#-------------------------------------- #

which.max(temp)      # index of the (first) maximum element
which.min(temp)      # index of the (first) minimum element
which(temp >= 37.0)  # indices of all the elements that satisfy the condition


# vector functions:
#               max(x),   min(x),  sum(x),     prod(),
#               mean(x),  sd(),    median(x),  range(x)
#               sort(x),  rank(x),    order(x)
#               cumsum(), cumprod(x), cummin(x), cummax(x)
#               var(x)                            - simple variance
#               cor(x,y)                          - correlation between x and y
#               duplicated(x), unique(x)
#               summary()


# Exercise:

# 1. Create a numeric vector with a few values

# 2. Calculate minimum and maximum values

# 3. Use summary() function to explore basic statistics for this vector


#------------------ #
#   R help          #
#------------------ #

# Open the help file of an R function (two equivalent forms)
?sd
help(sd)

# Search the help system when the function name is not known
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")


#-------------------------------------------------------- #
#                         Factors                         #
#-------------------------------------------------------- #

studentID <- 1:10
grades <- c(
  "excellent", "good", "good", "fair", "excellent",
  "good", "fair", "fair", "good", "good"
)
summary(grades)   # summary of the plain character vector

# turn the character vector into a factor (categorical variable)
fgrades <- factor(grades)
summary(fgrades)  # summary of a factor: number of values in each level

# the order of the levels can be set explicitly:
fgrades <- factor(grades, levels = c("excellent", "good", "fair"))
summary(fgrades)


#-------------------------------------------------------- #
#                  Missing Values                         #
#-------------------------------------------------------- #

x <- c(734, 145, NA, 456, NA)  # numeric vector with missing values
is.na(x)         # is each element missing?
which(is.na(x))  # positions of the missing elements
anyNA(x)         # is there at least one missing value?
sum(is.na(x))    # number of missing values
x[!is.na(x)]     # the values that are not missing

# x == NA   # this does not work! A missing value cannot be compared to anything


# Functions applied to vectors that contain missing values:
mean(x)
# By default many statistical functions will not compute a result
# when the data contain missing values

# The help topic of the function describes how missing values are handled
?mean

# Compute the mean after removing the missing data
mean(x, na.rm = TRUE)


#-----------------------------------------------------------------------#
#                       Read-in data from a file                        #
#-----------------------------------------------------------------------#
# Load the Salaries dataset from a CSV file into the data frame df
df <- read.csv(file = "http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv")

# You can also use the RStudio "Import Dataset" menu (in the top-right pane)

# See more examples of reading the data from the various formats in IO.R script


#----------------------------------------------------------------------#
#                      Dataframe exploration                           #
#----------------------------------------------------------------------#

# Preview the first and the last few records
head(df)
tail(df)

# Structure of the data: column names, types and sample values
str(df)

# Summary of every column
summary(df)


# Three columns of the df dataframe hold categorical variables:
# rank, sex, and discipline.
# Convert each of them to a factor:
df[["rank"]] <- factor(df[["rank"]])
df[["sex"]] <- factor(df[["sex"]])
df[["discipline"]] <- factor(df[["discipline"]])

# Run the same summary function again to see the difference:
summary(df)


#----------------------------------------------------------------------#
#                      Dataframe exploration (continuation)            #
#----------------------------------------------------------------------#


# Exploratory analysis of the numeric data
min(df$salary)
max(df$salary)
range(df$yrs.service)
summary(df$salary)

# View the distribution as a histogram
hist(df$salary)

# Another way to look at the data: a boxplot
boxplot(df$salary)

# Q-Q plot to see whether the data are normally distributed
qqnorm(df$salary)
qqline(df$salary)

# Shapiro-Wilk test of normality
shapiro.test(df$salary)
# Null hypothesis - the sample comes from a normal distribution
# Alternative hypothesis - the sample does not come from a normal distribution
# p-value < 0.05 - reject the Null hypothesis: data are not normally distributed

# Categorical data analysis
summary(df$rank)
summary(df$sex)

#------------------------------------------------------------- #
#                       dataframes slicing                     #
#------------------------------------------------------------- #

df[3, 5]  # the element in the 3rd row, 5th column
df[3, ]   # all elements of the 3rd row
df[, 5]   # all elements of the 5th column

df$sex                  # access a column using the variable name
df[df$sex == "Male", ]  # all the rows for which the variable sex equals "Male"

# Create a new dataframe as a subset of the original one
disc.B <- df[df$discipline == "B", ]

# The function subset() is an alternative way to do the same
disc.B <- subset(df, discipline == "B")

# Save the new data frame into a file
write.csv(disc.B, file = "B_discipline.csv", row.names = FALSE, quote = FALSE)


#------------------------------------------------------------------ #
#     Data Analysis (numeric vs. categorical)                       #
#------------------------------------------------------------------ #

# We would like to compare the salaries of various groups in the dataframe


# Let's explore whether women get salaries similar to those of men
boxplot(salary ~ sex, data = df)

# mean salary of each group, computed one group at a time
mean(df$salary[df$sex == "Female"])
mean(df$salary[df$sex == "Male"])


# the same means for every subgroup at once, using tapply
tapply(X = df$salary, INDEX = df$sex, FUN = mean)


# Comparing 2 means: two-sample t test
# (by default t.test() does not assume equal variances, i.e. it runs
# Welch's version of the test)
t.test(salary ~ sex, data = df)


# The difference in salary for various disciplines in this dataset is even more pronounced
boxplot(salary ~ sex + discipline, data = df)

# Let's compare the salaries of
# - Assistant Professor,
# - Associate Professor,
# - Professor

boxplot(salary ~ rank, data = df)

# tapply computes the mean salary
# for each value of a categorical variable in one call:
tapply(X = df$salary, INDEX = df$rank, FUN = mean)


# Analysis of variance (ANOVA)
boxplot(salary ~ rank, data = df)
aov.res <- aov(salary ~ rank, data = df)
summary(aov.res)
# Since the p-value is < 0.05 we reject the Null hypothesis of equal means.
# F value = variance of the group means (Mean Square Between) /
#           mean of the within group variances (Mean Squared Error)
# If the Null hypothesis is true, we expect F to be close to 1.
# A large F ratio means that the variation among group means
# is larger than we expect to see by chance.

# To compare the means of each pair of subgroups, perform
# Tukey's honestly significant difference test
TukeyHSD(aov.res)
# There is a significant difference between Professor and the other 2 groups, but
# we cannot reject the Null hypothesis comparing the mean salaries of
# Assistant Professor and Associate Professor


#----------------------------------------------------------------#
#              Analysis of 2 numeric variables
#----------------------------------------------------------------#
# Scatter plot of salary against years of service
plot(salary ~ yrs.service, data = df)

# Linear regression: fit a linear model of salary on years of service
lm.fit <- lm(salary ~ yrs.service, data = df)
summary(lm.fit)


# Plot the original data together with the fitted line
plot(df$yrs.service, df$salary)
abline(lm.fit)

# Predict the salary for 10 to 20 years of service
predict(lm.fit, newdata = data.frame(yrs.service = 10:20))


#--------------------------------- #
#     Statistical Models           #
#--------------------------------- #
#
#  y ~ x     - regression
#  y ~ x - 1 - regression through the origin
#  y ~ x + z - multiple regression
#  y ~ x * z - multiple regression with interaction
#  ...


#---------------------------- #
#  Clean current R session    #
#---------------------------- #

# List the variables defined in the current session (two equivalent calls)
objects()
ls()

# Remove everything from your working environment
rm(list = ls())

# Get information about the R version and the versions of the packages used
sessionInfo()


#-------------------------------------------------------#
#  Evaluation Link:                                     #
#  http://scv.bu.edu/survey/tutorial_evaluation.html    #
#  or                                                   #
#  rcs.bu.edu/eval                                      #
#-------------------------------------------------------#