## ------------------------------------
## Research Computing Services
## Boston University
## Basic Statistics in R
## Katia Bulekova, 2021
## ------------------------------------

library(readr)
library(ggplot2)
library(dplyr)

#install.packages("aod")   # for logistic regression
library(aod)

# A couple of useful packages (optional)
#install.packages("skimr")
library(skimr)
#install.packages("rstatix")
library(rstatix)
# install.packages("GGally")
library(GGally)

## Let's read the two datasets we explored in the previous workshops:
USA.health <- read_csv("https://github.com/bu-rcs/bu-rcs.github.io/raw/main/Bootcamp/Data/USA_HealthData.csv") %>%
  rename(poor_health = "% Fair or Poor Health",
         p_smokers = "% Smokers",
         p_obese = "% Adults with Obesity",
         p_inactive = "% Physically Inactive",
         p_college = "% Some College",
         p_housing_problems = "% Severe Housing Problems") %>%
  select(State, County, poor_health : p_inactive, p_college, p_housing_problems) %>%
  filter( !is.na(County) )

USA.demogr <- read_csv("https://github.com/bu-rcs/bu-rcs.github.io/raw/main/Bootcamp/Data/USA_DemographicsData.csv") %>%
  rename( life_exp = "Life Expectancy",
          ph_distress = "% Frequent Physical Distress",
          limit_healthy_food = "% Limited Access to Healthy Foods",
          house_income = "Median Household Income",
          N_black = "# Black",
          N_rural = "# Rural") %>%
  select(State : house_income, Population, N_black, N_rural) %>%
  filter( !is.na(County) )

USA.data <- inner_join( USA.health, USA.demogr, by = c("State", "County"))

USA.regions <- read_csv("https://github.com/bu-rcs/bu-rcs.github.io/raw/main/Bootcamp/Data/us_states_census_regions.csv")

# Add the names of the Regions
USA.data <- merge(USA.data, USA.regions, by = "State")

glimpse(USA.data)
summary(USA.data)

## -----------------------------------------------------------------------------
# The skimr package provides a quick summary of the variables in a dataset
skim(USA.data)

# The rstatix package returns summary statistics as a data.frame:
USA.data %>%
  get_summary_stats( p_smokers, p_obese, p_inactive,
                     Population, house_income,    # columns to summarize
                     type = "full")

## -----------------------------------------------------------------------------
## Hypothesis Testing in R
## -----------------------------------------------------------------------------
## 1. State the hypothesis
## 2. Formulate an analysis plan
## 3. Perform the analysis
## 4. Interpret the results

## **Type I Error**  - the null hypothesis is rejected when it is true.
## *significance level* - the probability of a Type I error
## **Type II Error** - accepting a false null hypothesis.
## *power* of the test - the probability that the test correctly rejects
##        the null hypothesis, or in other words the likelihood of
##        avoiding a Type II error
## See https://stats.idre.ucla.edu/r/dae/power-analysis-for-two-group-independent-sample-t-test/
## for examples of computing the power of a test.
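## (Optional) A minimal sketch of such a power calculation with base R's
## power.t.test(); the delta and sd values below are illustrative assumptions,
## not estimates from our data:
power.t.test(delta = 1,          # difference in means we want to detect
             sd = 2,             # assumed common standard deviation
             sig.level = 0.05,   # probability of a Type I error
             power = 0.8)        # returns the required sample size per group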
### ********************************************
### Checking if a variable is normally distributed
### ********************************************
## Let's take a look at the *life expectancy* variable.
ggplot( USA.data, aes(life_exp) ) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = median(USA.data$life_exp, na.rm=T),
             linetype = "dashed", color = "red", size = 1.) +
  ggtitle("USA Life expectancy distribution") +
  xlab("Life expectancy, years") +
  annotate(geom = "text", x = 82, y = 400,
           label = round(median(USA.data$life_exp, na.rm=T), 1),
           color = "red") +
  theme_bw()

## Is this variable normally distributed?
## Let's first perform some *visual* analysis
ggplot( USA.data, aes(life_exp) ) +
  geom_density(color = "darkgray", fill = "lightgrey") +
  geom_vline(xintercept = median(USA.data$life_exp, na.rm=T),
             linetype = "dashed", color = "red", size = .8) +
  ggtitle("USA Life expectancy distribution") +
  xlab("Life expectancy, years") +
  annotate(geom = "text", x = 82, y = 0.13,
           label = round(median(USA.data$life_exp, na.rm=T), 1),
           color = "red") +
  theme_bw()

ggplot( USA.data, aes(sample = life_exp) ) +
  stat_qq() +
  stat_qq_line() +
  ggtitle("qq-plot of the Life expectancy variable") +
  theme_bw()

## 1. **State the hypothesis**:
### *Null hypothesis* - the sample comes from a normal distribution.
### *Alternative hypothesis* - the sample does not come from a normal distribution.
## 2. **Formulate an analysis plan**
### We will run the Shapiro-Wilk test of normality
### (http://www.sthda.com/english/wiki/normality-test-in-r)
## 3. **Perform the analysis**
shapiro.test(USA.data$life_exp)
## 4. **Interpret the results**
### From the output, the p-value is < 0.05, implying that the distribution of the data
### is significantly different from a normal distribution.
### So we reject the *null hypothesis* - the data are not normally distributed.

### ********************************************
### Comparing two samples (t-test)
### ********************************************
## Let's compare life expectancy in the New England and East South Ctr regions.
## Let's first summarize our data and find the median and mean values
USA.data %>%
  group_by(Region) %>%
  summarise( mean = mean(life_exp, na.rm=T),
             median = median(life_exp, na.rm=T))

## Let's visually compare the distributions of this variable for both regions
ggplot( USA.data, aes(x = life_exp) ) +
  geom_histogram(data = subset(USA.data, Region == "East South Ctr"),
                 binwidth = 1, fill = "red", alpha = 0.2) +
  geom_histogram(data = subset(USA.data, Region == "New. Eng."),
                 binwidth = 1, fill = "blue", alpha = 0.2) +
  geom_vline(xintercept = 74.5, linetype = "dashed", color = "red", size = 1.) +
  geom_vline(xintercept = 79.5, linetype = "dashed", color = "blue", size = 1.) +
  ggtitle("USA Life expectancy in the New England and East South Ctr regions") +
  xlab("Life expectancy, years") +
  annotate(geom = "text", x = 75, y = 77, label = 74.5, color = "red") +
  annotate(geom = "text", x = 80, y = 77, label = 79.5, color = "blue") +
  theme_bw()

## 1. **State the hypothesis**:
### *Null hypothesis* - the difference in means is equal to 0.
### *Alternative hypothesis* - the true difference in means is not equal to 0.
## 2. **Formulate an analysis plan**
### We will run the Welch two-sample t-test
### (https://www.datanovia.com/en/lessons/types-of-t-test/unpaired-t-test/welch-t-test/).
## 3. **Perform the analysis**
x <- USA.data$life_exp[USA.data$Region == "New. Eng."]
y <- USA.data$life_exp[USA.data$Region == "East South Ctr"]
t.test(x, y)
## 4. **Interpret the results**
### From the output, the p-value is < 0.05, implying that the true difference in means
### is not equal to zero, so we can reject the *null hypothesis*.
### It's always a good idea to include in your report the 95% confidence interval
### and other statistics along with the p-value.

## (Optional) A t-test using the rstatix package provides output in a tibble format
USA.data %>%
  filter(Region %in% c("New. Eng.", "East South Ctr")) %>%
  t_test(life_exp ~ Region)
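## (Optional) Alongside the p-value, an effect size shows how large the
## difference is in practical terms. A minimal sketch with rstatix's
## cohens_d(), assuming the same two regions as above:
USA.data %>%
  filter(Region %in% c("New. Eng.", "East South Ctr")) %>%
  cohens_d(life_exp ~ Region)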
#### **Exercise**
## 1. Check if the variable *house_income* is normally distributed
## 2. Perform a Welch t-test to compare *house_income* between the New England
##    and East South Ctr regions

### ********************************************
### Correlation and Covariance
### ********************************************
## Check if there is a correlation between the percent of smokers in a county
## and the *poor health* variable:
## Null hypothesis:
# The percent of smokers and the percent of people with poor health are independent.
# In other words, the true correlation between these two variables is zero.
ggplot( USA.data, aes(x = p_smokers, y = poor_health)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm") +
  labs( title = "Adults with Poor Health vs. Percent of Smokers",
        subtitle = "USA Population",
        x = "Smokers (%)",
        y = "Poor Health (%)")

# Another way to look at this data:
ggpairs(USA.data, columns = c(3, 4))

# Color based on a categorical variable
east.sub <- USA.data %>%
  filter(Region %in% c("New. Eng.", "East South Ctr", "East North Ctr"))
ggpairs(east.sub, columns = c(3, 4))
ggpairs(east.sub, columns = c(3, 4), ggplot2::aes(colour = Region))

# Another useful option for the ggpairs plot from the GGally package:
ggpairs( east.sub, columns = c(3, 4, 16),
         upper = list(continuous = "density", combo = "box_no_facet"),
         lower = list(continuous = "points", combo = "dot_no_facet") )

# The `cov()` function computes the covariance.
# The `cor()` function calculates the correlation between two vectors.
# The `cor.test()` function performs a test of significance of the correlation.
cov( USA.data$p_smokers, USA.data$poor_health)
cor( USA.data$p_smokers, USA.data$poor_health)
cor.test( USA.data$p_smokers, USA.data$poor_health)

# Conclusion
# Based on the p-value we reject the null hypothesis and state that there is
# enough evidence to conclude that the percentage of smokers and the percentage
# of people with poor health are not independent (they correlate with each other).
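## (Optional) If the variables are not normally distributed, a rank-based
## (Spearman) correlation is a common robust alternative to Pearson's.
## A minimal sketch (ties in the data may trigger a warning that an exact
## p-value cannot be computed):
cor.test( USA.data$p_smokers, USA.data$poor_health, method = "spearman")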
#### **Exercise**
# Find the correlation and covariance, and perform the test for correlation,
# between the *p_college* and *p_smokers* variables.

### ********************************************
### Chi-square test
### testing for a significant relationship between two categorical variables
### ********************************************
# Let's assign a label to each county based on the median household income:
USA.data_extra <- USA.data %>%
  mutate( wealth = case_when(house_income < 50000 ~ "low",
                             house_income > 80000 ~ "high",
                             TRUE ~ "middle"))
table(USA.data_extra$wealth)

# We would like to test if there is a relationship between two categorical
# variables: *wealth* and *Region*.
# Let's first create a contingency table of these two variables:
table(USA.data_extra$Region, USA.data_extra$wealth)
xtabs( ~ Region + wealth, data = USA.data_extra)

## Let's also draw a barplot to visually represent the data:
ggplot(USA.data_extra) +
  aes(x = Region, fill = wealth) +
  geom_bar() +
  scale_fill_hue() +
  coord_flip() +
  theme_minimal()

## 1. **State the hypothesis**:
### *Null hypothesis* - the variables *Region* and *wealth* are independent;
### there is no relationship between these two categorical variables.
### *Alternative hypothesis* - the variables are dependent.
### Knowing the value of one variable helps to predict the value of the other variable.
## 2. **Formulate an analysis plan**
### We will run the Chi-square test of independence
### (http://www.sthda.com/english/wiki/chi-square-test-of-independence-in-r).
## 3. **Perform the analysis**
res <- chisq.test( table(USA.data_extra$Region, USA.data_extra$wealth) )
res

## When a warning such as *Chi-squared approximation may be incorrect* appears,
## it means that the smallest expected frequency is lower than 5.
## To avoid this issue, we can either combine some levels or use *Fisher's exact test*
## (a sketch follows below).
## 4. **Interpret the results**
## From the output, the p-value is < 0.05, implying that we can reject the null
## hypothesis and state that there is a significant relationship between the
## *Region* and *wealth* variables.
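## (Optional) As mentioned above, Fisher's exact test is an alternative when
## expected counts are small. A minimal sketch; simulate.p.value = TRUE requests
## a Monte Carlo p-value, since the exact computation can be infeasible for
## tables larger than 2x2:
fisher.test( table(USA.data_extra$Region, USA.data_extra$wealth),
             simulate.p.value = TRUE)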
### ********************************************
### Linear Regression
### ********************************************
### **Regression analysis** is a statistical technique for determining
# the relationship between two or more variables.
# *Simple Linear Regression* - a single explanatory variable
# *Multiple Linear Regression* - the value of the dependent variable depends
#       on more than one explanatory variable.

# To visualize the linear relationship between the independent and
# dependent variables, we use a scatterplot:
ggplot(USA.data) +
  aes(x = p_inactive, y = p_obese) +
  geom_point( alpha = .2) +
  labs( title = "p_obese vs. p_inactive",
        subtitle = "USA Population",
        x = "p_inactive (%)",
        y = "p_obese (%)") +
  theme_minimal()

## We can compute the correlation between these two variables:
cor( USA.data$p_obese, USA.data$p_inactive)

# Now we can run the linear regression:
lm.fit <- lm( p_obese ~ p_inactive, data = USA.data)
summary(lm.fit)

## R^2 allows us to measure how good this model is. In our case *R-squared*
## is about 0.3, so our model explains only about 30% of the data variability.

# We should also look at the residuals.
# Their plot should look random - we should see no patterns:
residuals <- lm.fit$residuals
fitted <- lm.fit$fitted.values
res <- data.frame(residuals, fitted)
ggplot(res, aes(x = fitted, y = residuals)) +
  geom_point(alpha = 0.2) +
  labs(title = "Residuals vs Fitted") +
  theme_minimal()

#### **Exercise**
## Perform a linear regression for the variables *poor_health* and *p_smokers*.

### ********************************************
### Logistic Regression
### ********************************************
## Example from UCLA (https://stats.idre.ucla.edu/r/dae/logit-regression/)

### Read the data
mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
head(mydata)
str(mydata)
summary(mydata)

## Convert rank to a categorical variable
mydata$rank <- factor(mydata$rank)
summary(mydata)

# Two-way contingency table of the categorical outcome and predictors
xtabs(~ admit + rank, data = mydata)

# Estimate a logistic regression model using the glm (generalized linear model) function
?glm
# glm is used to estimate various models. Specifying the "family" argument is important!
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)
# deviance residuals - a measure of model fit
# All predictors (gre, gpa, and rank) are statistically significant.
# The logistic regression coefficients give the change in the log odds of
# the outcome for a one-unit increase in the predictor variable:
#  - for every one-unit change in gre, the log odds of admission
#    (versus non-admission) increases by 0.002
#  - for a one-unit increase in gpa, the log odds of being admitted
#    to graduate school increases by 0.804
#  - the indicator variables for rank have a slightly different interpretation:
#    having attended an undergraduate institution with a rank of 2,
#    versus an institution with a rank of 1, changes the log odds of admission by -0.675

# Use the confint function to obtain confidence intervals for the coefficient estimates
confint(mylogit)

# Test for an overall effect of rank using the wald.test function from the aod package
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), Terms = 4:6)
# The chi-squared test statistic of 20.9, with three degrees of freedom, is associated
# with a p-value of 0.00011, indicating that the overall effect of rank is
# statistically significant.

# Exponentiate the coefficients and interpret them as odds ratios
## odds ratios and 95% CI
exp(cbind(OR = coef(mylogit), confint(mylogit)))
## Interpretation:
## for a one-unit increase in gpa, the odds of being admitted to
## graduate school (versus not being admitted) increase by a factor of 2.23

# Calculate predicted probabilities
newdata1 <- with(mydata, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
# Following the UCLA example, predict the probability of admission for each rank,
# holding gre and gpa at their means:
newdata1$rankP <- predict(mylogit, newdata = newdata1, type = "response")
newdata1

# A measure of how well our model fits:
# we want to check whether the model with predictors fits
# significantly better than a model with just an intercept (i.e., a null model).
# The test statistic is the difference between the residual deviance
# for the model with predictors and the null model. The test statistic is
# distributed chi-squared with degrees of freedom equal to the difference
# in degrees of freedom between the current and the null model
# (i.e., the number of predictor variables in the model).

# Deviance difference between the two models (i.e., the test statistic)
with(mylogit, null.deviance - deviance)
# The degrees of freedom for the difference between the two models is equal
# to the number of predictor variables in the model
with(mylogit, df.null - df.residual)
# p-value
with(mylogit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
# The chi-square of 41.46 with 5 degrees of freedom and an associated p-value of
# less than 0.001 tells us that our model as a whole fits
# significantly better than an empty model.

#### -------------------------------------------------------------------------
#### Read more
#### -------------------------------------------------------------------------
# t-test
# http://www.sthda.com/english/wiki/t-test-analysis-is-it-always-correct-to-compare-means#t-test
# Correlation analysis
# http://www.sthda.com/english/wiki/correlation-test-between-two-variables-in-r#correlation-formula
# Chi-square test of independence
# http://www.sthda.com/english/wiki/chi-square-test-of-independence-in-r
# Linear Regression
# https://www.datacamp.com/community/tutorials/linear-regression-R