## ------------------------------------
## Research Computing Services
## Boston University
## Basic Statistics in R
## Katia Bulekova, 2021
## ------------------------------------

library(readr)
library(ggplot2)
library(dplyr)

#install.packages("aod")   # for logistic regression
library(aod)

# A couple of useful packages (optional)
#install.packages("skimr")
library(skimr)
#install.packages("rstatix")
library(rstatix)
# install.packages("GGally")
library(GGally)

## Let's read the two datasets we explored in the previous workshops:
USA.health <- read_csv("https://github.com/bu-rcs/bu-rcs.github.io/raw/main/Bootcamp/Data/USA_HealthData.csv") %>%
  rename(poor_health = "% Fair or Poor Health",
         p_smokers = "% Smokers",
         p_obese = "% Adults with Obesity",
         p_inactive = "% Physically Inactive",
         p_college = "% Some College",
         p_housing_problems = "% Severe Housing Problems") %>%
  select(State, County, poor_health : p_inactive, p_college, p_housing_problems) %>%
  filter( !is.na(County) )

USA.demogr <- read_csv("https://github.com/bu-rcs/bu-rcs.github.io/raw/main/Bootcamp/Data/USA_DemographicsData.csv") %>%
  rename( life_exp = "Life Expectancy",
          ph_distress = "% Frequent Physical Distress",
          limit_healthy_food = "% Limited Access to Healthy Foods",
          house_income = "Median Household Income",
          N_black = "# Black",
          N_rural = "# Rural") %>%
  select(State : house_income, Population, N_black, N_rural) %>%
  filter( !is.na(County) )

USA.data <- inner_join( USA.health, USA.demogr, by = c("State", "County"))

USA.regions <- read_csv("https://github.com/bu-rcs/bu-rcs.github.io/raw/main/Bootcamp/Data/us_states_census_regions.csv")

# Add the names of the Regions
USA.data <- merge(USA.data, USA.regions, by = "State")

glimpse(USA.data)
summary(USA.data)

## -----------------------------------------------------------------------------
# The skimr package provides a quick summary of the variables in a dataset
skim(USA.data)

# The rstatix package returns summary statistics as a data.frame:
USA.data %>%
  get_summary_stats( p_smokers, p_obese, p_inactive,
                     Population, house_income,    # columns to summarize
                     type = "full")

## -----------------------------------------------------------------------------
## Hypothesis Testing in R
## -----------------------------------------------------------------------------
## 1. State the hypothesis
## 2. Formulate an analysis plan
## 3. Perform the analysis
## 4. Interpret the results

## **Type I Error**  - the null hypothesis is rejected when it is true.
## *significance level* - the probability of a Type I error
## **Type II Error** - accepting a false null hypothesis.
## *power* of the test - the probability that the test correctly rejects
##        the null hypothesis, or in other words the likelihood of
##        avoiding a Type II error
## See https://stats.idre.ucla.edu/r/dae/power-analysis-for-two-group-independent-sample-t-test/
## for examples of computing the power of a test.
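## (Optional) A minimal sketch of such a power calculation with base R's
## power.t.test(); the delta and sd values below are illustrative assumptions,
## not estimates from our data:
power.t.test(delta = 1,          # difference in means we want to detect
             sd = 2,             # assumed common standard deviation
             sig.level = 0.05,   # probability of a Type I error
             power = 0.8)        # returns the required sample size per group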
### ********************************************
### Checking if a variable is normally distributed
### ********************************************
## Let's take a look at the *life expectancy* variable.
ggplot( USA.data, aes(life_exp) ) +
  geom_histogram(binwidth = 1) +
  geom_vline(xintercept = median(USA.data$life_exp, na.rm=T),
             linetype = "dashed", color = "red", size = 1.) +
  ggtitle("USA Life expectancy distribution") +
  xlab("Life expectancy, years") +
  annotate(geom = "text", x = 82, y = 400,
           label = round(median(USA.data$life_exp, na.rm=T), 1),
           color = "red") +
  theme_bw()

## Is this variable normally distributed?
## Let's first perform some *visual* analysis
ggplot( USA.data, aes(life_exp) ) +
  geom_density(color = "darkgray", fill = "lightgrey") +
  geom_vline(xintercept = median(USA.data$life_exp, na.rm=T),
             linetype = "dashed", color = "red", size = .8) +
  ggtitle("USA Life expectancy distribution") +
  xlab("Life expectancy, years") +
  annotate(geom = "text", x = 82, y = 0.13,
           label = round(median(USA.data$life_exp, na.rm=T), 1),
           color = "red") +
  theme_bw()

ggplot( USA.data, aes(sample = life_exp) ) +
  stat_qq() +
  stat_qq_line() +
  ggtitle("qq-plot of the Life expectancy variable") +
  theme_bw()

## 1. **State the hypothesis**:
### *Null hypothesis* - the sample comes from a normal distribution.
### *Alternative hypothesis* - the sample does not come from a normal distribution.
## 2. **Formulate an analysis plan**
### We will run the Shapiro-Wilk test of normality
### (http://www.sthda.com/english/wiki/normality-test-in-r)
## 3. **Perform the analysis**
shapiro.test(USA.data$life_exp)
## 4. **Interpret the results**
### From the output, the p-value is < 0.05, implying that the distribution of the data
### is significantly different from a normal distribution.
### So we reject the *null hypothesis* - the data are not normally distributed.

### ********************************************
### Comparing two samples (t-test)
### ********************************************
## Let's compare life expectancy in the New England and East South Ctr regions.
## Let's first summarize our data and find the median and mean values
USA.data %>%
  group_by(Region) %>%
  summarise( mean = mean(life_exp, na.rm=T),
             median = median(life_exp, na.rm=T))

## Let's visually compare the distributions of this variable for both regions
ggplot( USA.data, aes(x = life_exp) ) +
  geom_histogram(data = subset(USA.data, Region == "East South Ctr"),
                 binwidth = 1, fill = "red", alpha = 0.2) +
  geom_histogram(data = subset(USA.data, Region == "New. Eng."),
                 binwidth = 1, fill = "blue", alpha = 0.2) +
  geom_vline(xintercept = 74.5, linetype = "dashed", color = "red", size = 1.) +
  geom_vline(xintercept = 79.5, linetype = "dashed", color = "blue", size = 1.) +
  ggtitle("USA Life expectancy in the New England and East South Ctr regions") +
  xlab("Life expectancy, years") +
  annotate(geom = "text", x = 75, y = 77, label = 74.5, color = "red") +
  annotate(geom = "text", x = 80, y = 77, label = 79.5, color = "blue") +
  theme_bw()

## 1. **State the hypothesis**:
### *Null hypothesis* - the difference in means is equal to 0.
### *Alternative hypothesis* - the true difference in means is not equal to 0.
## 2. **Formulate an analysis plan**
### We will run the Welch two-sample t-test
### (https://www.datanovia.com/en/lessons/types-of-t-test/unpaired-t-test/welch-t-test/).
## 3. **Perform the analysis**
x <- USA.data$life_exp[USA.data$Region == "New. Eng."]
y <- USA.data$life_exp[USA.data$Region == "East South Ctr"]
t.test(x, y)
## 4. **Interpret the results**
### From the output, the p-value is < 0.05, implying that the true difference in means
### is not equal to zero, so we can reject the *null hypothesis*.
### It's always a good idea to include in your report the 95% confidence interval
### and other statistics along with the p-value.

## (Optional) A t-test using the rstatix package provides output in a tibble format
USA.data %>%
  filter(Region %in% c("New. Eng.", "East South Ctr")) %>%
  t_test(life_exp ~ Region)
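## (Optional) Alongside the p-value, an effect size shows how large the
## difference is in practical terms. A minimal sketch with rstatix's
## cohens_d(), assuming the same two regions as above:
USA.data %>%
  filter(Region %in% c("New. Eng.", "East South Ctr")) %>%
  cohens_d(life_exp ~ Region)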
#### **Exercise**
## 1. Check if the variable *house_income* is normally distributed
## 2. Perform a Welch t-test to compare *house_income* between the New England
##    and East South Ctr regions

### ********************************************
### Correlation and Covariance
### ********************************************
## Check if there is a correlation between the percent of smokers in a county
## and the *poor health* variable:
## Null hypothesis:
# The percent of smokers and the percent of people with poor health are independent.
# In other words, the true correlation between these two variables is zero.
ggplot( USA.data, aes(x = p_smokers, y = poor_health)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "lm") +
  labs( title = "Adults with Poor Health vs. Percent of Smokers",
        subtitle = "USA Population",
        x = "Smokers (%)",
        y = "Poor Health (%)")

# Another way to look at this data:
ggpairs(USA.data, columns = c(3, 4))

# Color based on a categorical variable
east.sub <- USA.data %>%
  filter(Region %in% c("New. Eng.", "East South Ctr", "East North Ctr"))
ggpairs(east.sub, columns = c(3, 4))
ggpairs(east.sub, columns = c(3, 4), ggplot2::aes(colour = Region))

# Another useful option for the ggpairs plot from the GGally package:
ggpairs( east.sub, columns = c(3, 4, 16),
         upper = list(continuous = "density", combo = "box_no_facet"),
         lower = list(continuous = "points", combo = "dot_no_facet") )

# The `cov()` function computes the covariance.
# The `cor()` function calculates the correlation between two vectors.
# The `cor.test()` function performs a test of significance of the correlation.
cov( USA.data$p_smokers, USA.data$poor_health)
cor( USA.data$p_smokers, USA.data$poor_health)
cor.test( USA.data$p_smokers, USA.data$poor_health)

# Conclusion
# Based on the p-value we reject the null hypothesis and state that there is
# enough evidence to conclude that the percentage of smokers and the percentage
# of people with poor health are not independent (they correlate with each other).
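## (Optional) If the variables are not normally distributed, a rank-based
## (Spearman) correlation is a common robust alternative to Pearson's.
## A minimal sketch (ties in the data may trigger a warning that an exact
## p-value cannot be computed):
cor.test( USA.data$p_smokers, USA.data$poor_health, method = "spearman")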
#### **Exercise**
# Find the correlation and covariance, and perform the test for correlation,
# between the *p_college* and *p_smokers* variables.

### ********************************************
### Chi-square test
### testing for a significant relationship between two categorical variables
### ********************************************
# Let's assign a label to each county based on the median household income:
USA.data_extra <- USA.data %>%
  mutate( wealth = case_when(house_income < 50000 ~ "low",
                             house_income > 80000 ~ "high",
                             TRUE ~ "middle"))
table(USA.data_extra$wealth)

# We would like to test if there is a relationship between two categorical
# variables: *wealth* and *Region*.
# Let's first create a contingency table of these two variables:
table(USA.data_extra$Region, USA.data_extra$wealth)
xtabs( ~ Region + wealth, data = USA.data_extra)

## Let's also draw a barplot to visually represent the data:
ggplot(USA.data_extra) +
  aes(x = Region, fill = wealth) +
  geom_bar() +
  scale_fill_hue() +
  coord_flip() +
  theme_minimal()

## 1. **State the hypothesis**:
### *Null hypothesis* - the variables *Region* and *wealth* are independent;
### there is no relationship between these two categorical variables.
### *Alternative hypothesis* - the variables are dependent.
### Knowing the value of one variable helps to predict the value of the other variable.
## 2. **Formulate an analysis plan**
### We will run the Chi-square test of independence
### (http://www.sthda.com/english/wiki/chi-square-test-of-independence-in-r).
## 3. **Perform the analysis**
res <- chisq.test( table(USA.data_extra$Region, USA.data_extra$wealth) )
res

## When a warning such as *Chi-squared approximation may be incorrect* appears,
## it means that the smallest expected frequency is lower than 5.
## To avoid this issue, we can either combine some levels or use *Fisher's exact test*
## (a sketch follows below).
## 4. **Interpret the results**
## From the output, the p-value is < 0.05, implying that we can reject the null
## hypothesis and state that there is a significant relationship between the
## *Region* and *wealth* variables.
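## (Optional) As mentioned above, Fisher's exact test is an alternative when
## expected counts are small. A minimal sketch; simulate.p.value = TRUE requests
## a Monte Carlo p-value, since the exact computation can be infeasible for
## tables larger than 2x2:
fisher.test( table(USA.data_extra$Region, USA.data_extra$wealth),
             simulate.p.value = TRUE)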
### ********************************************
### Linear Regression
### ********************************************
### **Regression analysis** is a statistical technique for determining
# the relationship between two or more variables.
# *Simple Linear Regression* - a single explanatory variable
# *Multiple Linear Regression* - the value of the dependent variable depends
#       on more than one explanatory variable.

# To visualize the linear relationship between the independent and
# dependent variables, we use a scatterplot:
ggplot(USA.data) +
  aes(x = p_inactive, y = p_obese) +
  geom_point( alpha = .2) +
  labs( title = "p_obese vs. p_inactive",
        subtitle = "USA Population",
        x = "p_inactive (%)",
        y = "p_obese (%)") +
  theme_minimal()

## We can compute the correlation between these two variables:
cor( USA.data$p_obese, USA.data$p_inactive)

# Now we can run the linear regression:
lm.fit <- lm( p_obese ~ p_inactive, data = USA.data)
summary(lm.fit)

## R^2 allows us to measure how good this model is. In our case *R-squared*
## is about 0.3, so our model explains only about 30% of the data variability.

# We should also look at the residuals.
# Their plot should look random - we should see no patterns:
residuals <- lm.fit$residuals
fitted <- lm.fit$fitted.values
res <- data.frame(residuals, fitted)
ggplot(res, aes(x = fitted, y = residuals)) +
  geom_point(alpha = 0.2) +
  labs(title = "Residuals vs Fitted") +
  theme_minimal()

#### **Exercise**
## Perform a linear regression for the variables *poor_health* and *p_smokers*.

### ********************************************
### Logistic Regression
### ********************************************
## Example from UCLA (https://stats.idre.ucla.edu/r/dae/logit-regression/)

### Read the data
mydata <- read.csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
head(mydata)
str(mydata)
summary(mydata)

## Convert rank to a categorical variable
mydata$rank <- factor(mydata$rank)
summary(mydata)

# Two-way contingency table of the categorical outcome and predictors
xtabs(~ admit + rank, data = mydata)

# Estimate a logistic regression model using the glm (generalized linear model) function
?glm
# glm is used to estimate various models. Specifying the "family" argument is important!
mylogit <- glm(admit ~ gre + gpa + rank, data = mydata, family = "binomial")
summary(mylogit)
# deviance residuals - a measure of model fit
# All predictors (gre, gpa, and rank) are statistically significant.
# The logistic regression coefficients give the change in the log odds of
# the outcome for a one-unit increase in the predictor variable:
#  - for every one-unit change in gre, the log odds of admission
#    (versus non-admission) increases by 0.002
#  - for a one-unit increase in gpa, the log odds of being admitted
#    to graduate school increases by 0.804
#  - the indicator variables for rank have a slightly different interpretation:
#    having attended an undergraduate institution with a rank of 2,
#    versus an institution with a rank of 1, changes the log odds of admission by -0.675

# Use the confint function to obtain confidence intervals for the coefficient estimates
confint(mylogit)

# Test for an overall effect of rank using the wald.test function from the aod package
wald.test(b = coef(mylogit), Sigma = vcov(mylogit), Terms = 4:6)
# The chi-squared test statistic of 20.9, with three degrees of freedom, is associated
# with a p-value of 0.00011, indicating that the overall effect of rank is
# statistically significant.

# Exponentiate the coefficients and interpret them as odds ratios
## odds ratios and 95% CI
exp(cbind(OR = coef(mylogit), confint(mylogit)))
## Interpretation:
## for a one-unit increase in gpa, the odds of being admitted to
## graduate school (versus not being admitted) increase by a factor of 2.23

# Calculate predicted probabilities
newdata1 <- with(mydata, data.frame(gre = mean(gre), gpa = mean(gpa), rank = factor(1:4)))
# Following the UCLA example, predict the probability of admission for each rank,
# holding gre and gpa at their means:
newdata1$rankP <- predict(mylogit, newdata = newdata1, type = "response")
newdata1

# A measure of how well our model fits:
# we want to check whether the model with predictors fits
# significantly better than a model with just an intercept (i.e., a null model).
# The test statistic is the difference between the residual deviance
# for the model with predictors and the null model. The test statistic is
# distributed chi-squared with degrees of freedom equal to the difference
# in degrees of freedom between the current and the null model
# (i.e., the number of predictor variables in the model).

# Deviance difference between the two models (i.e., the test statistic)
with(mylogit, null.deviance - deviance)
# The degrees of freedom for the difference between the two models is equal
# to the number of predictor variables in the model
with(mylogit, df.null - df.residual)
# p-value
with(mylogit, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))
# The chi-square of 41.46 with 5 degrees of freedom and an associated p-value of
# less than 0.001 tells us that our model as a whole fits
# significantly better than an empty model.

#### -------------------------------------------------------------------------
#### Read more
#### -------------------------------------------------------------------------
# t-test
# http://www.sthda.com/english/wiki/t-test-analysis-is-it-always-correct-to-compare-means#t-test
# Correlation analysis
# http://www.sthda.com/english/wiki/correlation-test-between-two-variables-in-r#correlation-formula
# Chi-square test of independence
# http://www.sthda.com/english/wiki/chi-square-test-of-independence-in-r
# Linear Regression
# https://www.datacamp.com/community/tutorials/linear-regression-R