# ---------------------------------------#
#                                        #
#           Introduction to R            #
#      Research Computing Services       #
#                                        #
#            Katia Bulekova              #
#                                        #
# ---------------------------------------#

# R as a scientific calculator
2 + 3    # addition
2^3      # power
log(2)   # built-in functions
pi       # built-in constants

#------------------ #
#    variables      #
#------------------ #
a <- 3
A <- 7   # R is case sensitive - variables a and A are different variables

# Avoid using names F and T as those are predefined aliases for FALSE and TRUE

# There is an R function c(). To avoid confusion it is a good practice to
# avoid naming your own variables and functions "c".

# A variable name can contain letters, digits, underscores and dots and
# must start with a letter or a dot.
# A variable name cannot contain a dollar sign.
name   <- "Nikola Tesla"   # character
weight <- 150.6            # numeric
age    <- 87L              # integer
male   <- TRUE             # Boolean (or logical) variable
today  <- Sys.time()       # date and time variable

# Check object class:
class(age)

# There are 4 basic data structures: vectors, matrices, dataframes, lists

#------------------ #
#    R vectors      #
#------------------ #

# Vector is an array of R objects of the same type:
names <- c("Alex", "Nick", "Mike")
print(names)   # print result
names          # typing the variable name at the prompt shows its content

# Vectors can be defined in a number of ways:
c(2, -7, 5, 3, -1)                # concatenation
25:75                             # range of values
seq(from = 0, to = 3, by = 0.5)   # sequence definition
rnorm(50)                         # returns normally distributed values

# systolic blood pressure values
SBP <- c(96, 110, 100, 125, 90)

# diastolic blood pressure
DBP <- c(55, 70, 68, 100, 50)

# calculate MAP (mean arterial pressure)
MAP <- SBP / 3 + 2 * DBP / 3
MAP

# R arithmetic operators
#   +, -, *, /   - addition, subtraction, multiplication, division
#   ^ or **      - exponentiation
#   %%           - modulus
#   %/%          - integer division

# R logical (Boolean) operators
#   %in%                    - membership
#   <, <=, ==, >=, >, !=    - Boolean comparative
#
# Examples:
MAP <= 80
"Mike" %in% names   # check if string "Mike" is in vector names

#-------------------------------------- #
#    R vector slicing (sub-setting)     #
#-------------------------------------- #
temp <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5)
temp[2]            # returns second element
temp[2:4]          # returns second through 4th elements inclusive
temp[c(1, 3, 5)]   # returns 1st, 3rd and 5th elements
temp[-2]           # returns all but 2nd element
temp[c(TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, FALSE)]   # returns 1st, 3rd, and 7th elements

# compare each element of the vector with a value
temp < 37.0

# return only those elements of the vector that satisfy a specific condition
temp[temp < 37.0]

#-------------------------------------- #
#    Vector operations                  #
#-------------------------------------- #
which.max(temp)      # find the (first) maximum element and return its index
which.min(temp)
which(temp >= 37.0)  # find the location of all the elements that satisfy a specific condition

# vector functions:
#   max(x), min(x), sum(x), prod(),
#   mean(x), sd(), median(x), range(x)
#   sort(x), rank(x), order(x)
#   cumsum(), cumprod(x), cummin(x), cummax(x)
#   var(x)     - simple variance
#   cor(x,y)   - correlation between x and y
#   duplicated(x), unique(x)
#   summary()

# Exercise:
# 1. Create a numeric vector with a few values
# 2. Calculate minimum and maximum values
# 3. Use summary() function to explore basic statistics for this vector

#------------------ #
#    R help         #
#------------------ #

# Access help file for the R function
?sd
help(sd)

# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")

#-------------------------------------------------------- #
#    Factors                                              #
#-------------------------------------------------------- #
studentID <- 1:10
grades <- c("excellent", "good", "good", "fair", "excellent",
            "good", "fair", "fair", "good", "good")
summary(grades)

# convert character vector to factor (categorical variable)
fgrades <- factor(grades)
summary(fgrades)

# if we want to specify the order of the levels:
fgrades <- factor(grades, levels = c("excellent", "good", "fair"))
summary(fgrades)

#-------------------------------------------------------- #
#    Missing Values                                       #
#-------------------------------------------------------- #
x <- c(734, 145, NA, 456, NA)   # define a numeric vector
is.na(x)          # check if the element in the vector is missing
which(is.na(x))   # which elements are missing
anyNA(x)          # are there any missing data
sum(is.na(x))     # how many missing data are there
x[!is.na(x)]      # list values excluding missing

# x == NA   # this does not work !
# Note: a missing value (NA) cannot be compared to anything -
# a comparison such as x == NA yields NA rather than TRUE or FALSE

# Applying functions to vectors containing missing values:
mean(x)

# By default many statistical functions will not compute if the data contain
# missing values
# Read help topic for the function
?mean

# Perform computation removing the missing data
mean(x, na.rm = TRUE)

#-----------------------------------------------------------------------#
#    Read-in data from a file                                           #
#-----------------------------------------------------------------------#
df <- read.csv("http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv")

# You can also use RStudio "Import Dataset" menu (in the right-top pane)
# See more examples of reading the data from the various formats in IO.R script

#----------------------------------------------------------------------#
#    Dataframe exploration                                             #
#----------------------------------------------------------------------#

# Look at the first few records
head(df)
tail(df)

# Get the structure of the data:
str(df)

# Get the summary of the data:
summary(df)

# We have 3 columns with the categorical variables in our df dataframe:
# rank, sex, and discipline
# let's convert them to be factor variables:
df$rank       <- factor(df$rank)
df$sex        <- factor(df$sex)
df$discipline <- factor(df$discipline)

# Now let's run the same summary function again:
summary(df)

#----------------------------------------------------------------------#
#    Dataframe exploration (continuation)                              #
#----------------------------------------------------------------------#

# numeric data exploratory analysis
min(df$salary)
max(df$salary)
range(df$yrs.service)
summary(df$salary)

# view the data
hist(df$salary)

# another way to look at the data
boxplot(df$salary)

# view qq-plot to see if the data are normally distributed
qqnorm(df$salary)
qqline(df$salary)

# Shapiro-Wilk test of normality
shapiro.test(df$salary)
# Null-hypothesis - the sample comes from a normal distribution
# Alternative hypothesis - the sample does not come from a normal distribution
# p-value < 0.05 - reject the Null-hypothesis - data are not normally distributed

# categorical data analysis
summary(df$rank)
summary(df$sex)

#------------------------------------------------------------- #
#    dataframes slicing                                        #
#------------------------------------------------------------- #
df[3, 5]   # element, from the 3rd row, 5th column
df[3, ]    # all elements from the 3rd row
df[, 5]    # all elements in the 5th column
df$sex     # accessing elements using variable name
df[df$sex == "Male", ]   # list all the rows for which variable sex is equal to a specific value

# Create a new dataframe as a subset of the original one
disc.B <- df[df$discipline == "B", ]

# Alternatively we can use function subset()
disc.B <- subset(df, discipline == "B")

# Save a new data frame into a file
write.csv(disc.B, file = "B_discipline.csv", row.names = FALSE, quote = FALSE)

#------------------------------------------------------------------ #
#    Data Analysis (numeric vs. categorical)                        #
#------------------------------------------------------------------ #

# We would like to compare salaries of various groups in the dataframe
# Let's explore if women get similar salaries as men do
boxplot(salary ~ sex, data = df)
mean(df$salary[df$sex == "Female"])
mean(df$salary[df$sex == "Male"])

# calculate mean for each subgroup using tapply
tapply(df$salary, df$sex, mean)

# Comparing 2 means: Student's t test
# (t.test() does not assume equal variances by default, i.e. it runs
#  the Welch variant of the test)
t.test(salary ~ sex, data = df)

# The difference in salary for various disciplines in this dataset is even
# more pronounced
boxplot(salary ~ sex + discipline, data = df)

# Let's compare the salaries of
#   - Assistant Professor,
#   - Associate Professor,
#   - Professor
boxplot(salary ~ rank, data = df)

# There is an easier way to perform the above calculation
# for each value of a categorical variable:
tapply(df$salary, df$rank, mean)

# Analysis Of Variance
boxplot(salary ~ rank, data = df)
aov.res <- aov(salary ~ rank, data = df)
summary(aov.res)
# Since p-value is < 0.05 we reject the Null hypothesis of equal means
# F value = variance of the group means (Mean Square Between) /
#           mean of the within group variances (Mean Squared Error)
# If the Null hypothesis is true, we expect F to be close to 1.
# A large F ratio means that variation among group means
# is larger than we expect to see by chance

# To compare means between each subgroup - perform Tukey honestly significant
# difference test
TukeyHSD(aov.res)
# There is a significant difference between Professor and the other 2 groups,
# but we cannot reject the Null hypothesis comparing the mean salaries of
# Assistant Professor and Associate Professor

#----------------------------------------------------------------#
#    Analysis of 2 numeric variables                             #
#----------------------------------------------------------------#
plot(salary ~ yrs.service, data = df)

# Linear regression: fit linear model
lm.fit <- lm(salary ~ yrs.service, data = df)
summary(lm.fit)

# plot original data together with a fitted line
plot(df$yrs.service, df$salary)
abline(lm.fit)

# predict salary for 10 through 20 years of service
predict(lm.fit, newdata = data.frame(yrs.service = 10:20))

#--------------------------------- #
#    Statistical Models            #
#--------------------------------- #
#
#   y ~ x       - regression
#   y ~ x - 1   - regression through the origin
#   y ~ x + z   - multiple regression
#   y ~ x * z   - multiple regression with interaction
#   ...

#---------------------------- #
#    Clean current R session  #
#---------------------------- #

# check variables in the current session
objects()
ls()
rm(list = ls())   # remove everything from your working environment

# get information about R version and the versions of the packages used
sessionInfo()

#-------------------------------------------------------#
#    Evaluation Link:                                   #
#    http://scv.bu.edu/survey/tutorial_evaluation.html  #
#    or                                                 #
#    rcs.bu.edu/eval                                    #
#-------------------------------------------------------#