#---------------------------------------#
#                                       #
#   Data Wrangling in R                 #
#   Research Computing Services         #
#                                       #
#   Katia Bulekova                      #
#                                       #
#---------------------------------------#

#Data Science Workflow

# 1. Import
# 2. Clean (Tidy)
# 3. Understand:
#     - Transform
#     - Visualize
#     - Model
# 4. Communicate


# The tidyverse is an opinionated collection of R packages designed for data science. 
# All packages share an underlying design philosophy, grammar, and data structures. 


# *** Core tidyverse packages ***
# readr - import data
# tidyr - for data "tidying"
# dplyr - data manipulation
# stringr - string manipulation
# forcats -factors handling
# ggplot2 - data visualization
# tibble - modern representation of a data frame
# purrr - functional programming
#
# *** Other related packages ***
# hms - time manipulation
# lubridate - dates/times manimpulation
# DBI - databases
# haven - data imported from SPSS, SAS and Stata 
# readxl - excel data import
# rvest - web scaping

#----------------------------------------------


# install.packages("tidyverse")
library(tidyverse)


# Tidyverse has its own read_csv() function to read csv files.
# Compare:
df1 <- read.csv( "http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv")
df2 <- read_csv( "http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv")
class(df1)
class(df2)
head(df1)
head(df2)

# Notice the difference between the print() function behavior for df1 and df2:
print(df1)
print(df2)

# To print more rows and/or columns, use arguments:
print(df2, n = 20, width = Inf)


library(readxl)

# Reading data saved in Excel (xslx) format often have some challenges
visit <- read_excel("MedData.xlsx", sheet = "Visits")
visit

# We can see that the file contains some extra records at the top and at the bottom of the sheet.
# Use options to read_excel() function to skip these extra lines:
visit <- read_excel("MedData.xlsx", 
                    sheet = "Visits", 
                    skip = 3,  # how many lines to skip at the top of the sheet
                    n_max = 13)  # how many observations to read
visit


# Let's take a look at the type of each record to make sure that each column was read correctly
# This can be done with str() function.
# However dplyr package has also handy glimpse() function that has a nicely formatted output
str(visit)
glimpse(visit)

# Notice that DBP column is marked as "character" column even though it contains numeric values.
# This normally means that there are some missing values that were coded in non-standard way, so 
# R did not recognize them as missing values.
# In this case you should examine the column and try to identify the missing values:
min(visit$DBP)
#[1] "##N/A"

# Now we can go back and read file again adding "##N/A" as a Missing Data code:
visit <- read_excel("MedData.xlsx", 
                    sheet = "Visits", 
                    skip = 3,
                    n_max = 13,
                    na=c("","NA","##N/A"))
glimpse(visit)


##-------------------------#
##   |> and  %>% "pipes"
##-------------------------

# Native R's pipe symbol |> was introduced in 4.1.0
# It allows to send the output of one function as an input object to the next function:
visit |> glimpse()


# You may see another "pipe" symbol used much more frequently.
# %>% "pipe" symbol was introduced in magrittr package long time ago and 
# then it was adopted by all other packages in tidyverse library
visit %>% glimpse()
visit %>% summary()

# The following chain of functions:
sort(unique(visit$`Patient ID`), decreasing=TRUE)

# can be rewritten as
visit$`Patient ID` %>% 
  unique()  %>%            # find unique values
  sort(decreasing=TRUE)    # sort  in descending order


#-------------------------#
#    exercise
#-------------------------
# Read and explore "Patient Data" sheet from the same excel file
# pinfo <- read_excel(-----)

#  Use glimpse, head, summary, and other functions to explore the dataset


#-------------------------------------------------------------------------


#---------------------------------------------------------------
## A dataset is a collection of values, usually either 
# - numbers (if quantitative),  
# - strings (if qualitative),
# - logical (if binary).
# Every value belongs to a variable and an observation. 
# A variable contains all values that measure the same underlying attribute (like phone, age) across units. 
# An observation contains all values measured on the same unit (like a person, or a day, or an event) across attributes

## Clean or Tidy Data:
# - Each variable forms a column
# - Each observation forms a row
# Each type of observational unit forms a table

## Examples of messy datasets
# - Column headers are values, not variable names
# - Multiple variables are stored in one column
# - Variables stored in both rows and columns
# - Multiple types of observational units are stored in the same table
# - A single observational unit is stored in multiple tables.


#---------------------------------------------------------------------------

#
# Using dplyr package to 

# - Rename variables ( rename() )
# - Filter observations by their values ( filter() )
# - Reorder rows ( arrange() )
# - Select specific columns/variables ( select() )
# - Create new variables with functions of existing variables ( mutate() )
# - Summarise (summarise() )


# We can select columns in a Data Frame using "$" symbol, i.e:
visit$DBP

# It is not as easy to do so when the column name contains spaces or other special characters:
visit$`Patient ID`

# In some cases in makes sense to rename some columns to make it easy to work 
visit.clean <- visit %>%
  rename(id="Patient ID", 
         admission="Admission date",
         discharge="Discharge date",
         pulse="Heart Rate")


#---------------------------------------------
# Working with strings
#---------------------------------------------

# Converting strings to upper and lower case
toupper(visit.clean$Allergies)

# or using pipe operator
visit.clean$Allergies %>% toupper()

#Search character vector for a specific pattern:
grepl ("pain", visit.clean$Symptoms,  ignore.case=TRUE)

# Find all strings that contain either fever or pain (or both)
grepl ("fever|pain", visit.clean$Symptoms,  ignore.case=TRUE)


# grep( value=FALSE): returns a vector of indices of element where pattern is found
# grep( value=TRUE): returns a vector of indices of element where pattern is found
grep ("fever", visit.clean$Symptoms,  ignore.case=TRUE, value=FALSE)
grep ("fever", visit.clean$Symptoms,  ignore.case=TRUE, value=TRUE)


#------------------------------------------
# Working with dates
#------------------------------------------
class(visit.clean$admission)

curr.time <- Sys.time()  # get current date and time
curr.date <- Sys.Date()  # get current date 
str(curr.time)           # view the structure of an object
class(curr.time)         # view type of an object

# some systems do not have timezone set up
Sys.timezone()

# convert character string to POSIXlt
t1=as.POSIXct("2019-01-29 11:30:00", "%Y-%m-%d %H:%M:%S", tz="EST")
OlsonNames()  # list of Time Zones
str(t1)           


library(lubridate)
mydates <- c("2/25/2018", "3/5/2017", "4-18-2018", "7.5.2017")
class(mydates)
newdates <- mdy(mydates)
class(newdates)


# There are a number of handy packages that have various functions to work with dates:
# lubridate
# chron

# Some more examples working with dates and times can be found:
# http://rcs.bu.edu/examples/r/timesSeries/dateTime.R

# Here we will calculate the length of stay of each patient in the hospital
visit.clean$discharge - visit.clean$admission
#The result variable has "difftime" class:
stay <- visit.clean$discharge - visit.clean$admission
class(stay)
#To convert it to a numeric value use as.numeric() function:
as.numeric(visit.clean$discharge - visit.clean$admission)


# ********************************************************************************


#----------------------
#Filtering rows
#----------------------
visit.clean %>% filter( !is.na(DBP) )

## Filter observations
# R logical operators
# ? Comparoson
# >  >=
# ==  !=
# <   <=
# is.na   !is.na
#  %in%


## ************
#   Exercise:
## ************
# Select only those records for which pulse columns have values 100 and greater
# visit.clean %>% filter( --- )

# Select only those records for which DBP is less than 60 or SBP is greater than 120
# visit.clean %>% filter( --- | ---)

# Select only those records for which Temperature is greater than 99 and Symptoms include "fever"
# visit.clean %>% filter( Temperature --- &  grepl("---", Symptoms, ignore.case=T)   )

#***********


#----------------------
# Selecting Columns
#----------------------
visit.clean %>% select(id, Temperature:pulse)

# dplyr comes with a set of helper functions that can help you select groups of variables inside a select() call:
#    starts_with("XY"): every name that starts with "XY",
#    ends_with("XY"): every name that ends with "XY",
#    contains("XY"): every name that contains "XY",
#    matches("XY"): every name that matches "XY", where "XY" can be a regular expression,
#    num_range("x", 1:15): the variables named x01, x02, x03,..., x15,
#    one_of(XY): every name that appears in x, where XY is a character vector.

visit.clean %>% select( id, ends_with("BP") )


# Another useful function to specify the order of the columns is "relocate"
visit.clean %>% relocate(Temperature, .after = pulse)

#---------------------------------------------------
# Modifying existing and/or creating  new variables
#---------------------------------------------------
visit.clean %>% mutate( Temperature = (Temperature-32)*5/9, stay = as.numeric(discharge - admission) )

## ************
#   Exercise:
## ************

# Create a new column MAP which is equal to SBP/3 + 2*DBP/3
#visit.clean %>% mutate( --- ) 

# Let's put it all together:
# Use visit.clean dataframe as input and
# - select only those columns where Temperature is greater than 99F
# - select columns ID, DBP and SBP
# - calculate new variable MAP
#res <- visit.clean %>%
#            filter( --- ) %>%
#            select( --- ) %>%
#            mutate( --- ) 
# res


#---------------------------------------------------
#  Calculating summaries
#---------------------------------------------------
visit.clean %>% summarise( NumVisits = n(), 
                           max.t = max(Temperature), 
                           min.T = min(Temperature) )

# built-in functions often used within summarise()
# averages: mean(), median()
# spread: sd(), IQR(), mad()
# range: min(), max()
# count: n(), n_distinct()

## ************
#   Exercise:
## ************

# Calculate the number of distinct patients
# visit.clean %>% summarise( N = --- ) 


#---------------------------------------------------
#  Sorting dataframe by one or more variables
#---------------------------------------------------
visit.clean %>% arrange(id, admission)

# sort in descending order
visit.clean %>% arrange(id, desc(admission))


#---------------------------------------------------
#  Group by one or more variables
#---------------------------------------------------
visit.clean %>% group_by(id) %>% summarise( ave.pulse = mean(pulse) )


# Useful functions often used with group_by()
# first(), last(), nth()

# For each patient select the first record and find the length of stay
visit.clean %>% 
  group_by(id) %>% 
  arrange(admission) %>%
  summarise( first(discharge - admission) )


#---------------------------------------------------
#  Joining 2 dataframes
#---------------------------------------------------


# First let's read the data for each patient:
pinfo <- read_excel("MedData.xlsx", sheet = "Patient Data")

#let's make sure the date is well formatted
pinfo %>% head()
pinfo %>% glimpse()
pinfo %>% summary()

# Now we want to have a single dataframe that contains patient information and patient visit information
# There are a number of join* functions in dplyr package:
# inner_join
# left_join
# right_join
# full_join
# semi_join  - return all rows from x where there are matching values in y, keeping just columns from x.
# anti_join  - return all rows from x where there are not matching values in y, keeping just columns from x.

#Let's try full join of both dataframes we have
full_join(visit.clean, pinfo, by = c("id"="ID"))

# left join
result <- visit.clean  %>% left_join(pinfo, by = c("id"="ID"))
result

## ******************
#    Exercise
## ******************

# Calculate the mean stay of the hospital for Male and Female patients:
# result %>%
#   filter( --- ) %>%            # select observations where Gender is not missing
#   mutate( stay = --- ) %>%     # create a new variable stay equal to the length of stay in the hospital
#   group_by ( --- ) %>%         # group by gender
#   summarize( ave.stay = --- )  # calculate mean length of stay
  

## ************************************************************
## Converting Wide dataframes to long dataframes and back
## *************************************************************
library(tidyr)
city.temps <- data.frame( time = as.Date('2018-09-03') + 0:4,
                          Boston = rnorm( 5, 75, 3),
                          Denver = rnorm( 5, 85, 5),
                          SanFrancisco = rnorm(5, 80, 5),
                          Austin = rnorm( 5, 90, 5) )
city.temps

# Use gather to rearrange a table into a tidy data.frame:
# gather(data, key = "key", value = "value", ..., na.rm = FALSE,
# convert = FALSE, factor_key = FALSE)
city.temps2 <- pivot_longer( city.temps, 
                             Boston:Austin, 
                             names_to = "City", values_to = "Temperature" )

# or
city.temps2 <- pivot_longer( city.temps, 
                             !time, 
                             names_to = "City", values_to = "Temperature" )


city.temps2
glimpse(city.temps2)


#  Sometimes it is useful to be able to
# perform the opposite operation: convert long format dataframe into wide representation
city.temps3 <- pivot_wider( city.temps2, names_from = "City", values_from = "Temperature")
city.temps3


## ----------------------------------- 
## Additional resources:
## ----------------------------------- 

# 1. R for applied epidemiology and public health: https://epirhandbook.com/en/
# 2. R for data science:                           https://r4ds.had.co.nz/


#-------------------------------------------------------#
#  Evaluation Link:                                     #
#  http://scv.bu.edu/survey/tutorial_evaluation.html    #
#  or                                                   #
#  rcs.bu.edu/eval                                      #
#-------------------------------------------------------#