# R as a scientific calculator 2+3 # addition 2^3 # power log(2) # built-in functions #------------------ # # variables # #------------------ # a <- 3 A <- 7 # R is case sensitive # Avoid using names c, t, cat, F, T, D as those are built-in functions/constants # The variable name can contain letters, digits, underscores and dots and # start with the letter or dot # The variable name cannot contain dollar sign str.var <- "oxygen" # character variable num.var <- 15.99 # numerical variable bool.var <- TRUE # boolean (or logical) variable bool.var.1 <- F # logical values can be abbriviated rna.str <- "TTAGAATT" # string variable mode(rna.str) # get the type (or mode) of the object nchar(rna.str) # get the lenght of the string #------------------ # # R help # #------------------ # # Access help file for the R function ?sd help(sd) # Search for help ??"standard deviation" help.search("standard deviation") help.search("analysis of variance") #------------------ # # R vectors # #------------------ # # Vector is an array of R objects of the same type: num.vector <- c( 5,7, 9,11, 4, -1, 0) # Numeric vectors can be defined in a number of ways: vals <- c (2, -7, 5, 3, -1 ) # concatenation vals <- 25:75 # range of values vals <- seq(from=0, to=3, by=0.5) # sequence definition vals <- rnorm(5, mean=2, sd=1.5 ) # returns normally distributed values # Vector arithmetic: # systolic blood pressure values SBP <- c(96, 110, 100, 125, 90 ) # diastolic blood pressure DBP <- c(55, 70, 68, 100, 50) # calculate MAP (mean arterial pressure) MAP <- SBP / 3 + 2 * DBP / 3 print(MAP) # R operators # +, -, *, / - addition, subtraction, multiplication, division # ^ or ** - exponentiation # %in% - membership # : - sequence genration # <, <=, ==, >=, >, != - boolean comparative # |, || - OR vectorized/not vectorized # &, && - AND # # vector slicing (subsetting) x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5) # define a numeric vector x[2] # returns second element x[2:4] # returns second through 4th elements inclusive x[c(1,3,5)] # returns 1st, 3rd and 5th elements #compare each element of the vector with a value x < 37.0 #return only those elements of the vector that satisfy a specific condition x[ x < 37.0 ] # vector functions: # max(x), min(x), sum(x) # mean(x), sd(), median(x), range(x) # var(x) - simple variance # cor(x,y) - correlation between x and y # sort(x), rank(x), order(x) # cumsum(), cumprod(x), cummin(x), cumprod(x) # duplicated(x), unique(x) # summary() #---------------------------- # # Reading input files # #---------------------------- # # Read regular csv file: salt <- read.csv("http://rcs.bu.edu/classes/FC764/intersalt.csv") #This data contains median blood pressure, as a fuction of salt intake # b - numeric vector # bp - mean diastolic blood pressure (mm Hg) # na - mean sodium excretion (mmol/24h) # country #------------------------------ # # Exploring R dataframes # #------------------------------ # #Explore the regular dataset: #Look at the first and last few records head(salt) tail(salt) #Get the list of the columns names(salt) #Number of rows: nrow(salt) #Number of columns: ncol(salt) #Get the structure of the data: str(salt) #Get the summary of the data: summary(salt) # Get unique values of a vector unique(salt$country) #numeric data exploratory analysis min(salt$bp) max(salt$bp) range(salt$bp) summary(salt$bp) #------------------------ # # dataframes slicing # #------------------------ # # dataframe slicing (accessing elements) salt[2,3] # element, from the 2nd row, 3rd column salt[2,] # all elements from the 2nd row salt[ ,3] # all elements in the 3rd column salt$bp # accessing elemnts using variable name salt[salt$bp > 75, ] # list all the rows in the dataset for which variablebp is greater than 75 salt[salt$bp < 70, ] # list all the rows in the dataset for which variablebp is less than 70 #------------------------ # # Create dataframe # #------------------------ # organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast") genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000) estGeneCount<-c(30000,30000,13061,19099,6034) dt <- data.frame(organism=organism, genomesz=genomeSizeBP, genecount=estGeneCount) dt # Comute gene density (divide genome size by the genome count) dt$genomesz/dt$genecount # create a new column "base pairs" per gene dt$bp <- dt$genomesz/dt$genecount dt #------------------ # # Missing Values # #------------------ # x <- c(734, 145, NA, 456, NA) # define a numeric vector is.na(x) # check if the element in the vector is missing which(is.na(x)) # which elements are missing anyNA(x) # are there any missing data x == NA # this does not work ! - missing value cannot be compared to anything mean(x) # By default mamy statistical functions will not compute if the data contain missing values # Read help topic for the function ?mean #Perform computation removing the missing data mean(x, na.rm=TRUE) #--------------------------- # # Hypothesis Testing in R # #--------------------------- # # This dataset consists of gene expression measurements for 10 genes # under control and treatment conditions with 4 replicates each genht <- read.csv("http://rcs.bu.edu/classes/FC764/geHT.csv") str(genht) #All control values control <- c(genht$c1, genht$c2, genht$c3, genht$c4 ) # Perform One sample t-test with a Null hypothesis that the true mean is 2000 t.test(control, mu=2000) #The test result with a p-value of 0.03583 rejects the null hypothesis at a critical #value (alpha level) of 0.05 # 2-sample t-test treat <- c(genht$t1, genht$t2, genht$t3, genht$t4 ) t.test(control, treat) #The p-value for this test very strongly rejects the null hypothesis of no #difference between the mean of the treatment group and control group #------------------ # # R packages # #------------------ # # R contains many packages. Many of them can be found on the CRAN website # Bioconductor contains many packages used in bioinformatics #install.packages("tidyverse") #load package library(tidyverse) salt <- read_csv("http://rcs.bu.edu/classes/FC764/intersalt.csv") #Check the data glimpse(salt) # filter some records filter(salt, country == "Finland") # We can also use pipe operation : salt |> filter ( country == "Finland" ) # With pipe we can create a chain of commands to manipulate our input dataset: salt |> filter (country %in% c("Canada", "US", "Mexico", "Argentina", "Brazil", "Colombia")) |> group_by ( country ) |> summarize (N.obs = n(), ave.bp = mean(bp), max.bp = max(bp))