Intro to Data Manipulation with R

Hello World,

Here is a recorded version of an in-person training I have been doing.  Enjoy.  I end up coming back to this myself even for reference.

This episode is all about performing data manipulation to derive raw insights from your data using the R programming language.  Data manipulation is the core to anything and everything you do in business intelligence and machine learning.  This episode sets the base for all R based intelligence sessions from here on out.

Part 1: Introduction to Microsoft R Open.

Part 2: Introduction to R Data Structures

Part 3: Data Manipulation with R

Part 4: Beautiful Visualizations with R

Introduction
What is Data Manipulation
dplyr & magrittr
Install Packages
magrittr & pipe operator
reading in a .csv & cleaning names
Feature Manipulation
Observation Manipulation
Group Manipulation
Challenge Exersizes

#########################
#Libraries/Dependencies#
########################
install.packages("magrittr")
install.packages("dplyr")
install.packages("checkpoint")
#Adds %>% forward pipe operator
library("magrittr") 
#adds grouping and manipulations
library("dplyr")
library("checkpoint")
checkpoint("2016-03-29")
#########################
#'%>%' The Pipe Operator#
#########################

mult = function(x,y){ x * y}
#2 * 6 * 5 * 10
#Traditional Way
mult(10, mult(5, mult(6, 2)))
#Piped Method
#output into first parameter of next function
2 %>% 
mult(6) %>%
mult(5) %>%
mult(10)


###################
#Data In & Rename#
##################

#Read Data
m311 <- read.csv("C:\\data\\Miami_311.csv", 
header=TRUE, 
sep=",")
#Show bad Formatting
m311 
#Fix Formatting
m311 <- m311 %>% tbl_df() 
#Summary of Data Set
summary(m311$Case.Owner)
summary(m311)
#Renaming Part 1: Observations with Look Up Table (LUT)
caseOwnerLUT = c("Animal_Services" = "Animal Services",
"Public_Works_Construction-6-60" = "Construction",
"Public_Works_Mosquito_Control-8-60" = "Mosquito Control",
"Public_Works_Traffic_Engineering-10-60" = "Traffic Engineering",
"Public_Works_Waste_Management" = "Waste Management",
"Regulatory_and_Economic_Resources" = "Regulatory and Economic Resources",
"Community_Information_and_Outreach" = "Information and Outreach",
"Public_Works_Hwy_Engineering-5-60" = "Traffic Engineering",
"Public_Works_Road_And_Bridges-16-60" = "Road and Bridges",
"Public_Works_traffic_Signals_And_Signs-15-60" = "Traffic Engineering",
"RAAM-27-93" = "RAAM",
"Waste_Management" = "Waste Management"
) 
m311$Case.Owner = caseOwnerLUT[m311$Case.Owner] %>%
factor()
summary(m311$Case.Owner)
labels(m311)
#Renaming Part 2: Features
m311 = m311 %>% 
rename(CreatedDate = Ticket.Created.Date...Time) %>%
rename(LastUpdatedDate = Ticket.Last.Updated.Date...Time) %>%
rename(ClosedDate = Ticket.Closed.Date...Time) %>%
rename(Address = Location.Geo..Coded) %>%
rename(District = Neighborhood...District...Ward...etc.) %>%
rename(Zip = Zip...Postal.Code) %>%
rename(State = State...Province)

#Sampling
m311f = m311 %>% sample_frac(.1)
m311n = m311 %>% sample_n(1000)


#######################
#Feature Manipulation#
######################

#Select a few columns
m311 %>% 
select(Case.Owner, CreatedDate)

#Exclude a column
m311 %>%
select(-ClosedDate)

#Select with helper
x = m311 %>% 
select(contains("Issue"), Case.Owner)
x
distinct(x)

#Mutate
x = m311 %>%
mutate(
GoalKpi = (Goal.Days - Actual.Completed.Days) / Goal.Days
)
summary(x$GoalKpi) #obviously some issues with this calculation, like divide by zero perhaps?


###########################
#Observation Manipulation#
##########################

goals = m311 %>% 
select(Case.Owner, Issue.Type, Goal.Days, Actual.Completed.Days, Ticket.Status)

zeroGoal = goals %>%
filter(Goal.Days <= 0)
#how many are there?
nrow(zeroGoal)
summary(zeroGoal$Ticket.Status)
#Lets reduce our factors a bit here.
zeroGoal$Issue.Type = zeroGoal$Issue.Type %>% factor()
zeroGoal$Ticket.Status = factor(zeroGoal$Ticket.Status)

zeroGoal %>%
arrange(desc(Actual.Completed.Days)) %>%
glimpse()

zeroGoal$Actual.Completed.Days = as.numeric(zeroGoal$Actual.Completed.Days)

#Check Out Summaries
zeroGoal %>%
summarise(
mean = mean(Actual.Completed.Days, na.rm = TRUE), #easy way
sd = sd(Actual.Completed.Days, na.rm = TRUE),
median = median(Actual.Completed.Days[!is.na(Actual.Completed.Days)]), #hard way
observations = n(),
NasInActualCompleted = sum(is.na(Actual.Completed.Days))
)

#####################
#Group Manipulation#
####################

#A few properties about groups
x = zeroGoal %>%
group_by(Case.Owner)
class(x)
x
#looks like a dataframe

#but you can do grouped operations
zeroGoal %>%
group_by(Case.Owner) %>%
summarise(
obs = n(),
avg.comp.days = mean(Actual.Completed.Days, na.rm = TRUE),
max.comp.days = max(Actual.Completed.Days, na.rm = TRUE),
min.comp.days = min(Actual.Completed.Days, na.rm = TRUE)
) %>%
arrange(avg.comp.days)


#############################
#Practice Exersices - Part 1#
#############################
# For Each Year, What were #
# the top 3 Issues and #
# average resolution time #
############################

#Hints
install.packages("lubridate")
library("lubridate")
summary(m311$CreatedDate)

m311f$CreatedDate2 = as.Date(m311f$CreatedDate,
format="%m/%d/%Y")

year(m311f$CreatedDate2)

m311 %>% 
select(Ticket.ID, Issue.Type, Actual.Completed.Days) %>%
top_n(3, Actual.Completed.Days) %>%
arrange(desc(Actual.Completed.Days))

##################
#Part 1 - Answer#
#################
m311 %>%
mutate(
Year = year(as.Date(CreatedDate, format="%m/%d/%Y"))
) %>%
group_by(Year, Issue.Type) %>%
summarise(
obs = n(),
avgResTime = mean(Actual.Completed.Days, na.rm = TRUE)
) %>%
top_n(3, obs) %>%
arrange(desc(obs))

#############################
#Practice Exersices - Part 2#
#############################
# What are top 3 methods we #
# we will receive gps #
#############################


##################
#Part 2 - Answer#
#################

m311 %>%
group_by(Method.Received) %>%
summarise(
gpsObs = sum((!is.na(Latitude) & !is.na(Longitude))) 
) %>%
top_n(3, gpsObs) %>%
arrange(desc(gpsObs))

##############################
#Practice Exersices - Part 3 #
##############################
# Which Districts service #
# the most requests for each #
# department and what are #
# the top 3 issues #
##############################

##################
#Part 3 - Answer #
##################

m311 %>%
group_by(Case.Owner, District, Issue.Type) %>%
summarise(
issueObs = n()
) %>% 
arrange(desc(issueObs)) %>%
summarise(
totalObs = sum(issueObs),
top_issues = toString(Issue.Type[1:3])
) %>%
top_n(3, totalObs) %>%
arrange(desc(totalObs)) %>% glimpse()

x[6,] %>% glimpse()

Leave a Reply

Your email address will not be published. Required fields are marked *