Step 2: Clean the data

# get rid of metadata and only keep the second element in the list "mydata"
myList <- mydata[[2]]
# count how many elements are in "myList" and store the number into variable "numRows"
numRows <- length(myList)
# If we didn't have nullValue = NA, then we would need to do the following
# Test every elements in "myList" one by one, for each element:
# First, record the element into a temporary variable "record",
# Second, if the original element is NULL, assign NA to "record". If not, keep the original value.
# Third, re-assign the value of "record" to the element in "myList"
#for(i in 1:numRows) {
# record <- myList[[i]]
# record[sapply(record, is.null)] <- NA
# myList[[i]] <- record
#}
# Unlist "myList" and transform it to a dataframe, whose number of rows is the value of "numRows"
df <- data.frame(matrix(unlist(myList), nrow=numRows, byrow=T), stringsAsFactors = FALSE)
# remove the first 8 columns
df <- df[, -1:-8]
# rename the rest of columns
colnames(df) <- c("CASE_NUMBER","BARRACK","ACC_DATE","ACC_TIME","ACC_TIME_CODE","DAY_OF_WEEK","ROAD","INTERSECT_ROAD","DIST_FROM_INTERSECT","DIST_DIRECTION","CITY_NAME","COUNTY_CODE","COUNTY_NAME","VEHICLE_COUNT","PROP_DEST","INJURY","COLLISION_WITH_1","COLLISION_WITH_2")

Step 3: Understand the data using SQL (via SQLDF)

# how many accidents happen on SUNDAY
# Use sql to count how many obersavations meet the criterion that DAY_OF_WEEK is "SUNDAY"
# TRIM remove leading and trailing spaces from a string
sun_acc <- sqldf("select count(DAY_OF_WEEK) from df where TRIM(DAY_OF_WEEK) = 'SUNDAY'")
# Print the result
print(sun_acc)

## count(DAY_OF_WEEK)
## 1 2373

# how many accidents had injuries(might need to remove NAs from the data)
# Use sql to count how many obersavations meet the criterion that INJURY value is "YES"
inj_acc <- sqldf("select count(INJURY) from df where INJURY='YES'")
# Print the result
print(inj_acc)

## count(INJURY)
## 1 6433

# list the injuries by day
# count the number of injuries for each day of the week
list_inj <- sqldf("select DAY_OF_WEEK, count(CASE_NUMBER) as Totalnumber from df where INJURY='YES' GROUP BY TRIM(DAY_OF_WEEK)")
# Print the result
print(list_inj)

# how many accidents had injuries
# apply the length function on the "YES" subset of the column INJURY
tapply(df$INJURY, df$INJURY == 'YES', length)

## FALSE TRUE
## 12204 6433

# list the injuries by day
# apply the length function on subset of the column INJURY broken down by the value in WEEK_OF_DAY as well as INJURY
tapply(df$INJURY=="YES", df$DAY_OF_WEEK, sum, na.rm=TRUE)