#---------------------------------------# # # # Data Wrangling in R # # Research Computing Services # # # # Katia Bulekova # # # #---------------------------------------# #Data Science Workflow # 1. Import # 2. Clean (Tidy) # 3. Understand: # - Transform # - Visualize # - Model # 4. Communicate # The tidyverse is an opinionated collection of R packages designed for data science. # All packages share an underlying design philosophy, grammar, and data structures. # *** Core tidyverse packages *** # readr - import data # tidyr - for data "tidying" # dplyr - data manipulation # stringr - string manipulation # forcats -factors handling # ggplot2 - data visualization # tibble - modern representation of a data frame # purrr - functional programming # # *** Other related packages *** # hms - time manipulation # lubridate - dates/times manimpulation # DBI - databases # haven - data imported from SPSS, SAS and Stata # readxl - excel data import # rvest - web scaping #---------------------------------------------- # install.packages("tidyverse") library(tidyverse) # Tidyverse has its own read_csv() function to read csv files. # Compare: df1 <- read.csv( "http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv") df2 <- read_csv( "http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv") class(df1) class(df2) head(df1) head(df2) # Notice the difference between the print() function behavior for df1 and df2: print(df1) print(df2) # To print more rows and/or columns, use arguments: print(df2, n = 20, width = Inf) library(readxl) # Reading data saved in Excel (xslx) format often have some challenges visit <- read_excel("MedData.xlsx", sheet = "Visits") visit # We can see that the file contains some extra records at the top and at the bottom of the sheet. # Use options to read_excel() function to skip these extra lines: visit <- read_excel("MedData.xlsx", sheet = "Visits", skip = 3, # how many lines to skip at the top of the sheet n_max = 13) # how many observations to read visit # Let's take a look at the type of each record to make sure that each column was read correctly # This can be done with str() function. # However dplyr package has also handy glimpse() function that has a nicely formatted output str(visit) glimpse(visit) # Notice that DBP column is marked as "character" column even though it contains numeric values. # This normally means that there are some missing values that were coded in non-standard way, so # R did not recognize them as missing values. # In this case you should examine the column and try to identify the missing values: min(visit$DBP) #[1] "##N/A" # Now we can go back and read file again adding "##N/A" as a Missing Data code: visit <- read_excel("MedData.xlsx", sheet = "Visits", skip = 3, n_max = 13, na=c("","NA","##N/A")) glimpse(visit) ##-------------------------# ## |> and %>% "pipes" ##------------------------- # Native R's pipe symbol |> was introduced in 4.1.0 # It allows to send the output of one function as an input object to the next function: visit |> glimpse() # You may see another "pipe" symbol used much more frequently. # %>% "pipe" symbol was introduced in magrittr package long time ago and # then it was adopted by all other packages in tidyverse library visit %>% glimpse() visit %>% summary() # The following chain of functions: sort(unique(visit$`Patient ID`), decreasing=TRUE) # can be rewritten as visit$`Patient ID` %>% unique() %>% # find unique values sort(decreasing=TRUE) # sort in descending order #-------------------------# # exercise #------------------------- # Read and explore "Patient Data" sheet from the same excel file # pinfo <- read_excel(-----) # Use glimpse, head, summary, and other functions to explore the dataset #------------------------------------------------------------------------- #--------------------------------------------------------------- ## A dataset is a collection of values, usually either # - numbers (if quantitative), # - strings (if qualitative), # - logical (if binary). # Every value belongs to a variable and an observation. # A variable contains all values that measure the same underlying attribute (like phone, age) across units. # An observation contains all values measured on the same unit (like a person, or a day, or an event) across attributes ## Clean or Tidy Data: # - Each variable forms a column # - Each observation forms a row # Each type of observational unit forms a table ## Examples of messy datasets # - Column headers are values, not variable names # - Multiple variables are stored in one column # - Variables stored in both rows and columns # - Multiple types of observational units are stored in the same table # - A single observational unit is stored in multiple tables. #--------------------------------------------------------------------------- # # Using dplyr package to # - Rename variables ( rename() ) # - Filter observations by their values ( filter() ) # - Reorder rows ( arrange() ) # - Select specific columns/variables ( select() ) # - Create new variables with functions of existing variables ( mutate() ) # - Summarise (summarise() ) # We can select columns in a Data Frame using "$" symbol, i.e: visit$DBP # It is not as easy to do so when the column name contains spaces or other special characters: visit$`Patient ID` # In some cases in makes sense to rename some columns to make it easy to work visit.clean <- visit %>% rename(id="Patient ID", admission="Admission date", discharge="Discharge date", pulse="Heart Rate") #--------------------------------------------- # Working with strings #--------------------------------------------- # Converting strings to upper and lower case toupper(visit.clean$Allergies) # or using pipe operator visit.clean$Allergies %>% toupper() #Search character vector for a specific pattern: grepl ("pain", visit.clean$Symptoms, ignore.case=TRUE) # Find all strings that contain either fever or pain (or both) grepl ("fever|pain", visit.clean$Symptoms, ignore.case=TRUE) # grep( value=FALSE): returns a vector of indices of element where pattern is found # grep( value=TRUE): returns a vector of indices of element where pattern is found grep ("fever", visit.clean$Symptoms, ignore.case=TRUE, value=FALSE) grep ("fever", visit.clean$Symptoms, ignore.case=TRUE, value=TRUE) #------------------------------------------ # Working with dates #------------------------------------------ class(visit.clean$admission) curr.time <- Sys.time() # get current date and time curr.date <- Sys.Date() # get current date str(curr.time) # view the structure of an object class(curr.time) # view type of an object # some systems do not have timezone set up Sys.timezone() # convert character string to POSIXlt t1=as.POSIXct("2019-01-29 11:30:00", "%Y-%m-%d %H:%M:%S", tz="EST") OlsonNames() # list of Time Zones str(t1) library(lubridate) mydates <- c("2/25/2018", "3/5/2017", "4-18-2018", "7.5.2017") class(mydates) newdates <- mdy(mydates) class(newdates) # There are a number of handy packages that have various functions to work with dates: # lubridate # chron # Some more examples working with dates and times can be found: # http://rcs.bu.edu/examples/r/timesSeries/dateTime.R # Here we will calculate the length of stay of each patient in the hospital visit.clean$discharge - visit.clean$admission #The result variable has "difftime" class: stay <- visit.clean$discharge - visit.clean$admission class(stay) #To convert it to a numeric value use as.numeric() function: as.numeric(visit.clean$discharge - visit.clean$admission) # ******************************************************************************** #---------------------- #Filtering rows #---------------------- visit.clean %>% filter( !is.na(DBP) ) ## Filter observations # R logical operators # ? Comparoson # > >= # == != # < <= # is.na !is.na # %in% ## ************ # Exercise: ## ************ # Select only those records for which pulse columns have values 100 and greater # visit.clean %>% filter( --- ) # Select only those records for which DBP is less than 60 or SBP is greater than 120 # visit.clean %>% filter( --- | ---) # Select only those records for which Temperature is greater than 99 and Symptoms include "fever" # visit.clean %>% filter( Temperature --- & grepl("---", Symptoms, ignore.case=T) ) #*********** #---------------------- # Selecting Columns #---------------------- visit.clean %>% select(id, Temperature:pulse) # dplyr comes with a set of helper functions that can help you select groups of variables inside a select() call: # starts_with("XY"): every name that starts with "XY", # ends_with("XY"): every name that ends with "XY", # contains("XY"): every name that contains "XY", # matches("XY"): every name that matches "XY", where "XY" can be a regular expression, # num_range("x", 1:15): the variables named x01, x02, x03,..., x15, # one_of(XY): every name that appears in x, where XY is a character vector. visit.clean %>% select( id, ends_with("BP") ) # Another useful function to specify the order of the columns is "relocate" visit.clean %>% relocate(Temperature, .after = pulse) #--------------------------------------------------- # Modifying existing and/or creating new variables #--------------------------------------------------- visit.clean %>% mutate( Temperature = (Temperature-32)*5/9, stay = as.numeric(discharge - admission) ) ## ************ # Exercise: ## ************ # Create a new column MAP which is equal to SBP/3 + 2*DBP/3 #visit.clean %>% mutate( --- ) # Let's put it all together: # Use visit.clean dataframe as input and # - select only those columns where Temperature is greater than 99F # - select columns ID, DBP and SBP # - calculate new variable MAP #res <- visit.clean %>% # filter( --- ) %>% # select( --- ) %>% # mutate( --- ) # res #--------------------------------------------------- # Calculating summaries #--------------------------------------------------- visit.clean %>% summarise( NumVisits = n(), max.t = max(Temperature), min.T = min(Temperature) ) # built-in functions often used within summarise() # averages: mean(), median() # spread: sd(), IQR(), mad() # range: min(), max() # count: n(), n_distinct() ## ************ # Exercise: ## ************ # Calculate the number of distinct patients # visit.clean %>% summarise( N = --- ) #--------------------------------------------------- # Sorting dataframe by one or more variables #--------------------------------------------------- visit.clean %>% arrange(id, admission) # sort in descending order visit.clean %>% arrange(id, desc(admission)) #--------------------------------------------------- # Group by one or more variables #--------------------------------------------------- visit.clean %>% group_by(id) %>% summarise( ave.pulse = mean(pulse) ) # Useful functions often used with group_by() # first(), last(), nth() # For each patient select the first record and find the length of stay visit.clean %>% group_by(id) %>% arrange(admission) %>% summarise( first(discharge - admission) ) #--------------------------------------------------- # Joining 2 dataframes #--------------------------------------------------- # First let's read the data for each patient: pinfo <- read_excel("MedData.xlsx", sheet = "Patient Data") #let's make sure the date is well formatted pinfo %>% head() pinfo %>% glimpse() pinfo %>% summary() # Now we want to have a single dataframe that contains patient information and patient visit information # There are a number of join* functions in dplyr package: # inner_join # left_join # right_join # full_join # semi_join - return all rows from x where there are matching values in y, keeping just columns from x. # anti_join - return all rows from x where there are not matching values in y, keeping just columns from x. #Let's try full join of both dataframes we have full_join(visit.clean, pinfo, by = c("id"="ID")) # left join result <- visit.clean %>% left_join(pinfo, by = c("id"="ID")) result ## ****************** # Exercise ## ****************** # Calculate the mean stay of the hospital for Male and Female patients: # result %>% # filter( --- ) %>% # select observations where Gender is not missing # mutate( stay = --- ) %>% # create a new variable stay equal to the length of stay in the hospital # group_by ( --- ) %>% # group by gender # summarize( ave.stay = --- ) # calculate mean length of stay ## ************************************************************ ## Converting Wide dataframes to long dataframes and back ## ************************************************************* library(tidyr) city.temps <- data.frame( time = as.Date('2018-09-03') + 0:4, Boston = rnorm( 5, 75, 3), Denver = rnorm( 5, 85, 5), SanFrancisco = rnorm(5, 80, 5), Austin = rnorm( 5, 90, 5) ) city.temps # Use gather to rearrange a table into a tidy data.frame: # gather(data, key = "key", value = "value", ..., na.rm = FALSE, # convert = FALSE, factor_key = FALSE) city.temps2 <- pivot_longer( city.temps, Boston:Austin, names_to = "City", values_to = "Temperature" ) # or city.temps2 <- pivot_longer( city.temps, !time, names_to = "City", values_to = "Temperature" ) city.temps2 glimpse(city.temps2) # Sometimes it is useful to be able to # perform the opposite operation: convert long format dataframe into wide representation city.temps3 <- pivot_wider( city.temps2, names_from = "City", values_from = "Temperature") city.temps3 ## ----------------------------------- ## Additional resources: ## ----------------------------------- # 1. R for applied epidemiology and public health: https://epirhandbook.com/en/ # 2. R for data science: https://r4ds.had.co.nz/ #-------------------------------------------------------# # Evaluation Link: # # http://scv.bu.edu/survey/tutorial_evaluation.html # # or # # rcs.bu.edu/eval # #-------------------------------------------------------#