##
## =============  Input and Output of Datasets ==============
##
## R Tutorial
## Katia Bulekova, 2015, 2019, 2020
## Research Computing
## Boston University
##
## =======================================================

## ***********************
##    Read CSV file
## ***********************

# Read simple comma delimited file (csv) - assumes presence of the header
dataset <- read.csv("http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv")


### PERFORMANCE TIP ###
# nrows and colClasses argument:
# If the dataset is large, provide columns types and record number if possible
# a mild overestimate is possible for record number
# For this dataset the time is cut almost twice!

# *** na.strings ***
# Use na.strings argument to specify values that should be converted to NA (missing values)

## ***********************
# Advanced packages to load datasets 
# This finctions are often much faster for large datasets, but might require
# some extra settings to deal with poorly formatted input files
# Each package below performs significantly faster than the standard read.* functions and
# noramlly is easier to use, but might fail for some non-standard input formats
## ***********************

require(readr)
dataset <- read_csv( file = "http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv" )

require(data.table)
dataset <- fread ( input = "http://scv.bu.edu/examples/r/tutorials/Datasets/Salaries.csv" )


## **********************************
##    Read TXT file (tab delimeted)
## **********************************

# Read simple tab delimited file (txt) - assumes presence of the header
# Unlike csv read.table assumes no header by default
# we also have to specify the delimeter charaster (space - by default)
dataset <- read.table("your_file_name.txt",
                      header = TRUE,
                      sep = "\t")

# In this example we would like to leave Sex variable as factor, but 
#     Name variable should be set as character
dataset <- read.table("your_file_name.txt",
                      header = TRUE,
                      sep = "\t",
                      colClasses = c("integer","character",rep("integer",3),"factor"))

### PERFORMANCE TIP ###
# read.table will work faster for large files if comment.char is set as ""
dataset <- read.table("your_file_name.txt",
                      header = TRUE,
                      sep = "\t",
                      colClasses = c("integer","character",rep("integer",3),"factor"),
                      comment.char = "")

# In some cases file might contain a few lines that would want to skip
# Here is an example of the file that does not have a header, but has a few lines
# explaining the dataset
# This file also contains some blank line we would like to skip
dataset <- read.table("your_file_name.txt",
                      header = FALSE,
                      sep = "\t",
                      colClasses = c("integer","character",rep("integer",3),"factor"),
                      comment.char = "",
                      skip = 8,
                      blank.lines.skip = TRUE)
# set column names
names(dataset) <- c("ID","Name","Age","Height","Weight","Gender")

## *********************************
##    Read TXT file (fixed width)
## *********************************
dataset <- read.fwf("your_file_name.txt",
                      widths = c(12,19,8,11,11,6),
                      col.names = c("ID","Name","Age","Height","Weight","Gender"))


## ************************************
## Read zip files 
## ************************************

df <- read.table(unz("flights.zip", "your_file_name.csv"), header=TRUE, sep=",")

fdf1 <- read.table(gzfile("your_file_name.csv.gz"), header=TRUE, sep=",")

## ***************************
##    Read Excel files 
## ***************************
# There are a few R packages that can read excel files
library(readxl)
dataset <- read_excel("your_file_name.xlsx", sheet = 1)

# If there are an extral lines at the top of the file that you need to skip - use "skip" argument
# Below is an example how to read an excel file, skip the first 3 lines and read 13 lines, speicyfing 
# ##N/A symbol to be treated as a missing value:
visit <- read_excel("your_file_name.xlsx", 
                    sheet = "Visits", 
                    skip = 3,
                    n_max = 13,
                    na=c("","NA","##N/A"))


# Excel (and other sources) data can also be read from a Clipboard:
# Open Excel file, select and copy the cells and then run the code


# here we assume the first record is a header
clip <- readClipboard()
clip <- strsplit(clip, "\t")
dataset <- as.data.frame(do.call(rbind, clip[-1]), stringsAsFactors=FALSE)
names(dataset) <- clip[[1]]

# if no header is present in the selected cells
clip <- readClipboard()
clip <- strsplit(clip, "\t")
dataset <- as.data.frame(do.call(rbind, clip), stringsAsFactors=FALSE)

## ***************************
##    Read SAS files 
## ***************************
# Libraries to read SAS files:
#   - haven
#   - sas7bdat 

library("haven")
df <- read_sas("MyFile.sas7bdat")

library("sas7bdat")
dataset <- read.sas7bdat("MyFile.sas7bdat")


## ***************************
##    Read SPSS files 
## ***************************
# Libraries to read SPSS files:
#   - foreign (read.spss() function)
#   - Hmisc (spss.get() function)
library("foreign")
dataset <- read.spss("Datasets/gapminder.sav",
                    use.value.labels = TRUE,
                    to.data.frame    = TRUE)

## ***************************
##    Read Stata files 
## ***************************
library("foreign")
dataset <- read.dta("Datasets/gapminder.dta")


## ***********************
##    Write to  CSV file
## ***********************

# Read simple comma delimited file (csv) - assumes presence of the header
write.csv( dataset,
           file = "myDataset.csv")

# Supress writting row names into output file
write.csv( dataset,
           file = "myDataset.csv",
           row.names=FALSE)


## ***********************
##    Write to  txt file
## ***********************

write.table(dataset,
            file = "myDataset.txt",
            quote = FALSE, # do not use quotes aorund string variables
            sep = "\t",    # use tab as a separator
            na  = " ",     # output missing values with a space
            row.names = TRUE,
            col.names = TRUE)