# ---------------------------------------#
#                                        #
#     Graphics in R using ggplot2        #
#     Katia Bulekova                     #
#                                        #
# ---------------------------------------#

## ggplot2 cheat sheet:
# https://github.com/rstudio/cheatsheets/blob/master/data-visualization-2.1.pdf

## R Graphics CookBook:
# https://r-graphics.org/

## ggplot2 tutorial:
# http://r-statistics.co/Complete-Ggplot2-Tutorial-Part1-With-R-Code.html

## ggplot2: elegant graphics for data analysis
# https://ggplot2-book.org/

# To install ggplot2 package:
# install.packages("ggplot2")
# Note: after_stat() used below requires ggplot2 >= 3.3.0.

library(dplyr)
library(ggplot2)

## --------------------------- ##
##      Load Input Data        ##
## --------------------------- ##

## Throughout this tutorial we will use a (non-random) subset of the
## NHANES dataset:
# https://wwwn.cdc.gov/nchs/nhanes/
# There are a number of weights associated with the variables in the original
# dataset that we will ignore for this tutorial.
# As a result the distributions of the variables and their relationships
# cannot be projected onto the whole population.
nhanes <- read.csv(
  "http://rcs.bu.edu/examples/r/tutorials/Datasets/NHANES.csv"
)

# During this tutorial we will go through all the phases of our data analysis
# which we described during our "Data Wrangling" tutorial and see how graphics
# can enhance each step.

## --------------------------- ##
##     Exploring the Data      ##
## --------------------------- ##

head(nhanes)
str(nhanes)
summary(nhanes)

# Let's convert categorical variables into factors:
nhanes <- nhanes %>%
  mutate(
    Sex = factor(Sex),
    Race = factor(Race),
    HealthStatus = factor(HealthStatus),
    HeartAttack = factor(HeartAttack),
    Diabetes = factor(Diabetes),
    Rural = factor(Rural),
    Region = factor(Region)
  )
summary(nhanes)

## --------------------------- ##
##     Numeric Variables       ##
##        Histograms           ##
## --------------------------- ##

ggplot(nhanes, aes(x = Height)) +
  geom_histogram()

ggplot(nhanes, aes(x = Height)) +
  geom_histogram(binwidth = 5)

# At this point, while we are only exploring our data,
# we do not need to worry about details like colors, fonts, etc.
# Later we will learn how we can adjust those and make our graph more
# informative.

# We can add more layers to our graph. Let's add a density curve.
# after_stat(density) rescales the bar heights to densities so that the
# histogram and the density curve share the same y scale.
ggplot(nhanes, aes(x = Height)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 5) +
  geom_density()

# === Exercise: ===
# Build a histogram for the SBP variable
# What binwidth value is reasonable in this case?
# ggplot( nhanes, aes(x = *** )) + geom_histogram( )

# Compare the Height distribution by Sex
ggplot(nhanes, aes(x = Height, fill = Sex)) +
  geom_histogram(binwidth = 5)

# Note: geom_histogram() stacks the groups by default; add
# position = "identity" if you want the semi-transparent bars to overlap.
ggplot(nhanes, aes(x = Height, fill = Sex)) +
  geom_histogram(binwidth = 5, alpha = 0.5)

## --------------------------- ##
##     Numeric Variables       ##
##         Boxplots            ##
## --------------------------- ##

ggplot(nhanes, aes(x = Height)) +
  geom_boxplot()

# Compare Height distributions for different demographic groups
ggplot(nhanes, aes(x = Race, y = Height)) +
  geom_boxplot()

# === Exercise: ===
# Plot Boxplot of DBP separately for each gender

# Sometimes, instead of a simple boxplot, a violin plot might be more
# informative:
ggplot(nhanes, aes(x = Race, y = Height)) +
  geom_violin() +
  geom_jitter(width = 0.25)

## --------------------------- ##
##     Numeric Variables       ##
##       Scatterplots          ##
## --------------------------- ##

ggplot(nhanes, aes(x = Height, y = Weight)) +
  geom_point()

# Adding a smoothed trend line (the smoothing method is chosen automatically)
ggplot(nhanes, aes(x = Height, y = Weight)) +
  geom_point() +
  geom_smooth()

# A linear regression might be a good approximation here:
ggplot(nhanes, aes(x = Height, y = Weight)) +
  geom_point() +
  geom_smooth(method = "lm")

# As before we might want to explore the relationship between these two
# variables separately for Males and Females
ggplot(nhanes, aes(x = Height, y = Weight, fill = Sex)) +
  geom_point(aes(col = Sex)) +
  geom_smooth(method = "lm", aes(col = Sex))

# === Exercise: ===
# Plot SBP vs DBP

## --------------------------- ##
##    Categorical Variables    ##
##         Barplots            ##
## --------------------------- ##

ggplot(nhanes, aes(x = Race)) +
  geom_bar(stat = "count")

# Add labels
ggplot(nhanes, aes(x = Race)) +
  geom_bar(stat = "count") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -1)

# Alternatively, pre-compute the counts and plot them as they are
nhanes %>%
  count(Race) %>%
  ggplot(aes(x = Race, y = n)) +
  geom_col() +
  geom_text(aes(label = n), vjust = -1)

# When we have an ordinal variable, we might consider using stacked barplots:
nhanes %>%
  count(Race, HealthStatus) %>%
  ggplot(aes(x = Race, y = n, fill = HealthStatus)) +
  geom_col(position = "stack") +
  scale_fill_grey(end = 0.2, start = 0.8)

# === Exercise: ===
# Plot a barplot for the four regions in the dataset

# Side-by-side barplots
nhanes %>%
  count(Race, HealthStatus) %>%
  ggplot(aes(x = Race, y = n, fill = HealthStatus)) +
  geom_col(position = "dodge")

# Same with the labels
nhanes %>%
  count(Race, HealthStatus) %>%
  ggplot(aes(x = Race, y = n, fill = HealthStatus)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = n), position = position_dodge(0.9), vjust = -1)

## --------------------------- ##
##    Categorical Variables    ##
##  Piecharts and Donut plots  ##
## --------------------------- ##

# Warning:
# Piecharts are not recommended for graphical exploration;
# barcharts might be a better choice for this purpose.
# However, if a pie-chart is well annotated, it might be used for
# data communication.

nhanes.region <- nhanes %>%
  count(Region) %>%
  mutate(
    fraction = n / sum(n),            # Share of each region (0..1)
    ymax = cumsum(fraction),          # Coord. of top for each rectangle
    ymin = c(0, head(ymax, n = -1)),  # Coord. of bottom for each rectangle
    pos = (ymin + ymax) / 2,          # Label position: rectangle midpoint
    label = paste0(Region, "\n", n)
  )

# Stacked rectangles in Cartesian coordinates ...
ggplot(
  nhanes.region,
  aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = Region)
) +
  geom_rect() +
  geom_text(x = 3.5, aes(y = pos, label = label), size = 6)

# ... become a donut once the y axis is mapped to the angle
ggplot(
  nhanes.region,
  aes(xmin = 3, xmax = 4, ymin = ymin, ymax = ymax, fill = Region)
) +
  geom_rect() +
  # x here controls label position (inner / outer)
  geom_text(x = 2, aes(y = pos, label = label), size = 6) +
  coord_polar(theta = "y") +
  xlim(c(-1, 4)) +
  theme_void() +
  theme(legend.position = "none")

#=====================================
#
# Changing Title and Axis labels
#
#=====================================
?labs
# title
# subtitle
# caption
# x
# y

# Save one of our previous plots
gplot <- ggplot(nhanes, aes(x = Height, y = Weight, fill = Sex)) +
  geom_point(aes(col = Sex)) +
  geom_smooth(method = "lm", aes(col = Sex))

# Add labels to the axes and a title to the plot
gplot +
  labs(
    title = "Weight vs. Height",
    subtitle = "Male and Female subjects",
    caption = "National Health and Nutrition Examination Survey",
    x = "Height",
    y = "Weight"
  )

# The title and axis labels can also be added separately
gplot +
  xlab("Height") +
  ylab("Weight") +
  ggtitle("Weight vs. Height", subtitle = "Male and Female subjects")

#=====================================
#
# Changing axis limits
#
#=====================================
# Note: ylim() removes the observations outside the limits before the
# statistics are computed, so the regression lines are re-fitted on the
# remaining points. Use coord_cartesian(ylim = c(30, 120)) to zoom only.
gplot +
  ylim(30, 120)

#============================================================
#
# Changing appearance of the points (color, size, symbol)
#
#============================================================
?geom_point
# colour, shape, size

# if the colour, shape and size do not change:
ggplot(nhanes, aes(x = Height, y = Weight)) +
  geom_point(colour = "blue", size = 2, shape = 15)

# if however we would like to change color based on another variable
# we would put these arguments in the "aes" function:
ggplot(nhanes, aes(x = Height, y = Weight)) +
  geom_point(size = 2, aes(colour = Sex))

# The same can also be achieved using the "col" argument in the main
# ggplot() function:
ggplot(nhanes, aes(x = Height, y = Weight, col = Sex)) +
  geom_point(size = 2)

# Compare:
# In the first plot, the color argument is applied to the points only,
# while in the second it also affects the regression line
ggplot(nhanes, aes(x = Height, y = Weight)) +
  geom_point(size = 2, aes(colour = Sex)) +
  geom_smooth(method = "lm")

ggplot(nhanes, aes(x = Height, y = Weight, col = Sex)) +
  geom_point(size = 2) +
  geom_smooth(method = "lm")

#==================#
#                  #
# Legend Position  #
#                  #
#==================#
gplot1 <- gplot +
  labs(
    title = "Weight vs. Height",
    subtitle = "Male and Female subjects",
    caption = "National Health and Nutrition Examination Survey",
    x = "Height",
    y = "Weight"
  )

# remove legend (the value is case-sensitive: "none", not "None")
gplot1 + theme(legend.position = "none")

# or place it to a different location
gplot1 + theme(legend.position = "bottom")

#============================================================
#
# Changing "theme"
#
#============================================================
## theme_set(theme_classic())  # globally change the theme

# or change the theme for a single graph:
gplot1 + theme_bw()
gplot1 + theme_classic()
gplot1 + theme_dark()
gplot1 + theme_minimal()

# The following themes are not part of ggplot2; they are provided by the
# "ggthemes" package: install.packages("ggthemes")
# Inside a braced block only the last value is auto-printed, so each plot
# is printed explicitly.
if (requireNamespace("ggthemes", quietly = TRUE)) {
  print(gplot1 + ggthemes::theme_clean())
  print(gplot1 + ggthemes::theme_economist())
  print(gplot1 + ggthemes::theme_tufte())
} else {
  message("Install the 'ggthemes' package to try the additional themes.")
}

#============================================================
#
# Facets: display multiple plots at once for each individual category
#
#============================================================

# let's display the scatter plot separately for male and female subjects
gplot2 <- ggplot(nhanes, aes(x = Age, y = SBP)) +
  labs(
    title = "Systolic Blood Pressure as a function of Age",
    x = "Age",
    y = "SBP"
  ) +
  geom_point(size = 2) +
  geom_smooth(method = "lm")

gplot2 + facet_wrap(~Sex)

# by default x and y axis will have the same limits in every panel, but
# this can be changed:
gplot2 + facet_wrap(~Sex, scales = "free")

# === Exercise: ===
# Using the gplot2 plot, try to use 2 variables in the facet_wrap function:
# ~ Sex + Race
# Now try to use facet_grid() function instead of facet_wrap()
# And notice the difference in how the titles in each subplot change

#----------------------
# Flip coordinates
#----------------------
ggplot(nhanes, aes(x = HealthStatus)) +
  geom_bar(stat = "count")

# Sometimes it is useful to flip the plot, for example when
# the labels on the x axis overlap
ggplot(nhanes, aes(x = HealthStatus)) +
  geom_bar(stat = "count") +
  coord_flip()