# R as a scientific calculator
2+3    # addition
2^3    # power
log(2) # built-in functions


#------------------ #
#     variables     #
#------------------ #

a <- 3
A <- 7  # R is case sensitive

# Avoid using names c, t, cat, F, T, D as those are built-in functions/constants

# The variable name can contain letters, digits, underscores and dots and 
# start with the letter or dot
# The variable name cannot contain dollar sign
str.var  <- "oxygen"   # character variable
num.var  <- 15.99                  # numerical variable
bool.var <- TRUE                   # boolean (or logical) variable
bool.var.1 <- F                    # logical values can be abbriviated


rna.str <- "TTAGAATT"    # string variable
mode(rna.str)            # get the type (or mode) of the object
nchar(rna.str)           # get the lenght of the string

#------------------ #
#   R help          #
#------------------ #

# Access help file for the R function
?sd
help(sd)

# Search for help
??"standard deviation"
help.search("standard deviation")
help.search("analysis of variance")


#------------------ #
#   R vectors       #
#------------------ #

# Vector is an array of R objects of the same type:
num.vector <- c( 5,7, 9,11, 4, -1, 0)

# Numeric vectors can be defined in a number of ways:
vals <- c (2, -7, 5, 3, -1 )             # concatenation
vals <- 25:75                            # range of values
vals <- seq(from=0, to=3, by=0.5)        # sequence definition
vals <- rnorm(5, mean=2, sd=1.5 )        # returns normally distributed values

# Vector arithmetic:
# systolic blood pressure values
SBP <- c(96, 110, 100, 125, 90 )

# diastolic blood pressure
DBP <- c(55, 70, 68, 100, 50)

# calculate MAP (mean arterial pressure)
MAP <- SBP / 3 + 2 * DBP / 3
print(MAP)


# R operators
#              +, -, *, /  - addition, subtraction, multiplication, division
#              ^ or **     - exponentiation
#              %in%        - membership
#              :           - sequence genration
#              <, <=, ==, >=, >, !=     - boolean comparative
#              |, ||       - OR  vectorized/not vectorized
#              &, &&       - AND
#              


# vector slicing (subsetting)
x <- c(36.6, 38.2, 36.4, 37.9, 41.0, 39.9, 36.8, 37.5)    # define a numeric vector
x[2]         # returns second element 
x[2:4]       # returns second through 4th elements inclusive
x[c(1,3,5)]  # returns 1st, 3rd and 5th elements

#compare each element of the vector with a value
x < 37.0

#return only those elements of the vector that satisfy a specific condition
x[ x < 37.0 ]    


# vector functions:
#               max(x),   min(x),     sum(x)
#               mean(x),  sd(), median(x),  range(x)
#               var(x)                            - simple variance
#               cor(x,y)                          - correlation between x and y
#               sort(x), rank(x), order(x)
#               cumsum(), cumprod(x), cummin(x), cumprod(x)
#               duplicated(x), unique(x)
#               summary()


#---------------------------- #
#   Reading input files       #
#---------------------------- #

# Read regular csv file:
salt <- read.csv("http://rcs.bu.edu/classes/FC764/intersalt.csv")

#This data contains median blood pressure, as a fuction of salt intake
# b - numeric vector
# bp - mean diastolic blood pressure (mm Hg)
# na - mean sodium excretion (mmol/24h)
# country 

#------------------------------ #
#   Exploring R dataframes      #
#------------------------------ #
#Explore the regular dataset:

#Look at the first and last few records
head(salt)
tail(salt)

#Get the list of the columns
names(salt)

#Number of rows:
nrow(salt)

#Number of columns:
ncol(salt)

#Get the structure of the data:
str(salt)

#Get the summary of the data:
summary(salt)

# Get unique values of a vector
unique(salt$country)


#numeric data exploratory analysis
min(salt$bp)
max(salt$bp)
range(salt$bp)
summary(salt$bp)

#------------------------ #
#   dataframes slicing    #
#------------------------ #
# dataframe slicing (accessing elements)
salt[2,3]    # element, from the 2nd row, 3rd column
salt[2,]     # all elements from the 2nd row
salt[ ,3]    # all elements in the 3rd column 

salt$bp  # accessing elemnts using variable name
salt[salt$bp > 75, ]  # list all the rows in the dataset for which variablebp is greater than 75
salt[salt$bp < 70, ]  # list all the rows in the dataset for which variablebp is less than 70


#------------------------ #
#   Create dataframe      #
#------------------------ #
organism<-c("Human","Mouse","Fruit Fly", "Roundworm","Yeast")
genomeSizeBP<-c(3000000000,3000000000,135600000,97000000,12100000)
estGeneCount<-c(30000,30000,13061,19099,6034)
dt <- data.frame(organism=organism,
                 genomesz=genomeSizeBP, 
                 genecount=estGeneCount)
dt

# Comute gene density (divide genome size by the genome count)
dt$genomesz/dt$genecount

# create a new column "base pairs" per gene
dt$bp <- dt$genomesz/dt$genecount
dt


#------------------ #
#   Missing Values  #
#------------------ #
x <- c(734, 145, NA, 456, NA)    # define a numeric vector
is.na(x)              # check if the element in the vector is missing
which(is.na(x))       # which elements are missing
anyNA(x)              # are there any missing data

x == NA   # this does not work ! - missing value cannot be compared to anything

mean(x)
# By default mamy statistical functions will not compute if the data contain missing values


# Read help topic for the function
?mean

#Perform computation removing the missing data
mean(x, na.rm=TRUE)

#--------------------------- #
#   Hypothesis Testing in R  #
#--------------------------- #

# This dataset consists of gene expression measurements for 10 genes 
# under control and treatment conditions with 4 replicates each
genht <- read.csv("http://rcs.bu.edu/classes/FC764/geHT.csv")

str(genht)


#All control values
control <- c(genht$c1, genht$c2, genht$c3, genht$c4  )


# Perform One sample t-test with a Null hypothesis that the true mean is 2000
t.test(control, mu=2000)
#The test result with a p-value of 0.03583 rejects the null hypothesis at a critical
#value (alpha level) of 0.05

# 2-sample t-test
treat <- c(genht$t1, genht$t2, genht$t3, genht$t4  )
t.test(control, treat)
#The p-value for this test very strongly rejects the null hypothesis of no
#difference between the mean of the treatment group and control group


#------------------ #
#   R packages      #
#------------------ #
# R contains many packages. Many of them can be found on the CRAN website 
# Bioconductor contains many packages used in bioinformatics


#install.packages("tidyverse")

#load package
library(tidyverse)

salt <- read_csv("http://rcs.bu.edu/classes/FC764/intersalt.csv")


#Check the data
glimpse(salt)

# filter some records
filter(salt, country == "Finland")

# We can also use pipe operation :
salt |> filter ( country == "Finland" )

# With pipe we can create a chain of commands to manipulate our input dataset:
salt |>
  filter (country %in% c("Canada", "US", "Mexico", "Argentina", "Brazil", "Colombia")) |>
  group_by ( country ) |>
  summarize (N.obs = n(), ave.bp = mean(bp), max.bp = max(bp))