Fixed-site public bicycles, while providing convenience to citizens on one hand, also face a shortage of vehicles at the docks or a surplus of vehicles. In this spatial analysis, we use open data from Philadelphia’s IndegoBike to predict the demand for bicycle sharing at all docking points to ensure that bicycles or docking points are available when needed.
As a cycling friendly city, Philadelphia’s busy public transportation makes the best sample for studying public bike scheduling. The data for this study uses IndegoBike open data from May and June, and from these data 20-24 weeks were selected for subsequent analysis.
library(tidyverse)
library(sf)
library(lubridate)
library(tigris)
library(tidycensus)
library(viridis)
library(riem)
library(gridExtra)
library(knitr)
library(kableExtra)
library(RSocrata)
library(ggplot2)
library(osmdata)
library(gganimate)
library(FNN)
library(grid)
library(rjson)
library(gifski)
options(tigris_class = "sf")
root.dir =
paste0("https://raw.githubusercontent.com/urbanSpatial",
"/Public-Policy-Analytics-Landing/master/DATA/")
source(
paste0("https://raw.githubusercontent.com/urbanSpatial/",
"Public-Policy-Analytics-Landing/master/functions.r"))
palette5 <- c("#f9b294","#f2727f","#c06c86","#6d5c7e","#315d7f")
palette6 <- c("#eff3ff","#bdd7e7","#6baed6","#3182bd","#08519c")
palette4 <- c("#f9b294","#f2727f","#c06c86","#6d5c7e")
palette2 <- c("#f9b294","#f2727f")
palette1_main <- "#F2727F"
palette1_assist <- '#F9B294'
mapTheme <- function(base_size = 35, title_size = 45) {
theme(
text = element_text( color = "black"),
plot.title = element_text(hjust = 0.5, size = title_size, colour = "black",face = "bold"),
plot.subtitle = element_text(hjust = 0.5,size=base_size,face="italic"),
plot.caption = element_text(size=base_size,hjust=0),
axis.ticks = element_blank(),
panel.spacing = unit(6, 'lines'),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=base_size),
axis.title = element_text(face = "bold",size=base_size),
axis.text = element_blank(),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(size=base_size,colour = "black", face = "italic"),
legend.text = element_text(size=base_size,colour = "black", face = "italic"),
strip.text.x = element_text(size = base_size,face = "bold")
)
}
plotTheme <- function(base_size = 35, title_size = 45) {
theme(
text = element_text( color = "black"),
plot.title = element_text(hjust = 0.5, size = title_size, colour = "black",face = "bold"),
plot.subtitle = element_text(hjust = 0.5,size=base_size,face="italic"),
plot.caption = element_text(size=base_size,hjust=0),
axis.ticks = element_blank(),
panel.spacing = unit(6, 'lines'),
panel.background = element_blank(),
panel.grid.major = element_line("grey80", size = 0.1),
panel.grid.minor = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=6),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=base_size),
axis.title = element_text(face = "bold",size=base_size),
axis.text = element_text(size=base_size),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(size=base_size,colour = "black", face = "italic"),
legend.text = element_text(size=base_size,colour = "black", face = "italic"),
strip.text.x = element_text(size = base_size,face = "bold")
)
}
Download the data of season 2 from the IndegoBike website, we can use the substr function to extract the first 12 characters of the string “started_at”. For example, “5-01-2022 21:08:07” will be transfer to “5-01-2022 21”. Then we can use the mdy_h function to caculate the week and the time.
rideRaw <- read.csv("D:/Upenn/Upenn Lec/05-MUSA-508/Assign-06/HW/indego-trips-2022-q2/indego-trips-2022-q2.csv")
ride <- rideRaw %>%
mutate(interval_hour =mdy_h(substr(start_time, 1, 12)),
week = week(interval_hour),
dayotw = wday(interval_hour, label=TRUE),
start_station_name = as.character(start_station),
end_station_name = as.character(end_station))%>%
na.omit() %>%
filter(week %in% c(20:24))
census_api_key("155cc525674a0d27c98afbb7030d0802e9bb5543", overwrite = TRUE)
PHICensus <-
get_acs(geography = "tract",
variables = c("B01003_001", "B19013_001",
"B02001_002", "B08013_001",
"B08012_001", "B08301_001",
"B08301_010", "B01002_001"),
year = 2017,
state = "PA",
geometry = TRUE,
county=c("Philadelphia"),
output = "wide") %>%
rename(Total_Pop = B01003_001E,
Med_Inc = B19013_001E,
Med_Age = B01002_001E,
White_Pop = B02001_002E,
Travel_Time = B08013_001E,
Num_Commuters = B08012_001E,
Means_of_Transport = B08301_001E,
Total_Public_Trans = B08301_010E) %>%
select(Total_Pop, Med_Inc, White_Pop, Travel_Time,
Means_of_Transport, Total_Public_Trans,
Med_Age,
GEOID, geometry) %>%
mutate(Percent_White = White_Pop / Total_Pop,
Mean_Commute_Time = Travel_Time / Total_Public_Trans,
Percent_Taking_Public_Trans = Total_Public_Trans / Means_of_Transport)
PHITracts <-
PHICensus %>%
as.data.frame() %>%
distinct(GEOID, .keep_all = TRUE) %>%
select(GEOID, geometry) %>%
st_sf
CensusData <-
get_acs(geography = "tract",
variables = c("B01003_001E"),
year=2019, state="PA", county="Philadelphia",geometry = T, output="wide") %>%
st_transform(st_crs(PHITracts)) %>%
rename(Census_TotalPop = B01003_001E) %>%
dplyr::select(-NAME,-starts_with("B")) %>%
mutate(Census_areasqm = as.numeric(st_area(.)),
Census_PopDensity = Census_TotalPop/Census_areasqm) %>%
dplyr::select(-Census_TotalPop,-Census_areasqm)%>%
replace(is.na(.),0)
##st_write(CensusData, "CensusData.geojson")
ride_rsch <- ride%>%
group_by(start_station_name)%>%
summarize(lat=first(start_lat),lng=first(start_lon))
PHITracts <-
tigris::tracts(state = "PA", county = "Philadelphia") %>%
dplyr::select(GEOID)%>%
filter(GEOID!="36061000100")
ride2_sf = st_as_sf(ride_rsch, coords = c("lng", "lat"),
crs = st_crs(PHITracts), agr = "constant")
ride_station <- st_join(ride2_sf,PHITracts,left=TRUE)%>%na.omit()
ride_tract <- st_join(PHITracts,ride2_sf,left=TRUE)%>%na.omit()
station_list <- ride_station[["start_station_name"]]
ride3 <- ride %>%
filter(start_station_name %in% station_list,end_station_name %in% station_list)
Using the data obtained from the above steps, we can get the location information of all Indego stations in the Philadelphia.
ggplot()+
geom_sf(data=st_union(PHITracts),color="black",size=15,fill = "transparent")+
geom_sf(data=PHITracts,color="black",size=0.5,linetype ="dashed",fill = "transparent")+
geom_sf(data=ride_station,size=4,color=palette1_main)+
labs(title = "Map of Share Bike Stations, Philadelphia",
subtitle = "Red dots are stations")+
mapTheme()
study.panel <-expand.grid(
interval_hour = unique(ride3$interval_hour),
start_station_name =
unique(ride3$start_station_name))
ride3.panel <-ride3 %>%
mutate(Trip_Counter = 1) %>%
group_by(interval_hour, start_station_name) %>%
summarize(Trip_Count = sum(Trip_Counter, na.rm=T)) %>%
right_join(study.panel) %>%
replace(is.na(.),0) %>%
left_join(ride_station,by = "start_station_name") %>%
mutate(week = week(interval_hour),dayotw = wday(interval_hour, label = TRUE)) %>%
st_sf()
weather.Data <-
riem_measures(station = "PHL", date_start = "2022-05-01",
date_end = "2022-07-01")
weather.Panel <-
weather.Data %>%
mutate_if(is.character,
list(~replace(as.character(.), is.na(.), "0"))) %>%
replace(is.na(.), 0) %>%
mutate(interval_hour = ymd_h(substr(valid, 1, 13))) %>%
mutate(week = week(interval_hour),
dayotw = wday(interval_hour, label=TRUE)) %>%
filter(week %in% c(20:24))%>%
group_by(interval_hour,week,dayotw) %>%
summarize(Temp = max(tmpf),
Precip = sum(p01i),
Wind = max(sknt)) %>%
mutate(Temp = ifelse(Temp == 0, 42, Temp))%>%
replace(is.na(.),0)
After importing the bicycle data, we can continue to introduce some features related to bicycle usage for analysis. The first is the weather feature, which we can easily understand from life experience that people use bicycles less frequently in rainy days due to safety concerns. The usage rate is also lower in low temperatures. Therefore, we introduce three weather features: precipitation, wind and temperature, to analyze the correlation between weather and bicycle use.
grid.arrange(
ggplot(weather.Panel, aes(interval_hour,Precip)) +
geom_line(color=palette1_main,size=2)+
labs(x="",
title = "Weather Data - PHI - Week 20 - Week 24, 2022")+
plotTheme(),
ggplot(weather.Panel, aes(interval_hour,Wind)) +
geom_line(color=palette1_main,size=2)+
labs(x="",
title = "")+
plotTheme(),
ggplot(weather.Panel, aes(interval_hour,Temp)) +
geom_line(color=palette1_main,size=2)+
labs(x="Time",
title = "")+
plotTheme())
Similarly, the population base is also related to the proximity of public bike stations to a significant degree. Densely populated neighborhoods may require more frequent dispatching, which should also be taken into account.
ggplot()+
geom_sf(data=st_union(PHITracts),color="black",size=10,fill = "transparent")+
geom_sf(data=CensusData,aes(fill=q5(Census_PopDensity)))+
geom_sf(data=ride_station,color="red")+
scale_fill_manual(values = palette5,
labels = qBr(CensusData, "Census_PopDensity"),
name = "Per1000sqm")+
labs(title = "Map of Population Density, Philadelphia",
subtitle = "Population Per 1000sqm")+
mapTheme()
ride4 <-ride3.panel%>%
left_join(weather.Panel%>%dplyr::select(-dayotw), by = "interval_hour")%>%
rename("week"="week.x")%>%
dplyr::select(-week.y)
In this section, we will analyze the data for the previously screened five weeks.
Here, we use the data of the first three weeks (20-24) as the data of the training set and the data of the last two weeks (20-24) as the data of the test set.
ride5 <-
ride4 %>%
arrange(start_station_name, interval_hour) %>%
group_by(start_station_name) %>%
mutate(lagHour = dplyr::lag(Trip_Count,1),
lag2Hours = dplyr::lag(Trip_Count,2),
lag3Hours = dplyr::lag(Trip_Count,3),
lag4Hours = dplyr::lag(Trip_Count,4),
lag12Hours = dplyr::lag(Trip_Count,12),
lag1day = dplyr::lag(Trip_Count,24)) %>%
ungroup()
ride5 <-ride5%>%
left_join(CensusData%>%st_drop_geometry()%>%dplyr::select(GEOID,Census_PopDensity), by = "GEOID")%>%
mutate(isPrecip = ifelse(Precip > 0,"Rain/Snow", "None"))
ride.Train <- filter(ride5, week < 23)
ride.Test <- filter(ride5, week >= 23)
ride.cross <- ride5
Time Lag implies that the cycling situation at some point in the future is correlated with the cycling situation at some time in the past. From the results, most weeks show very comparable time series patterns with consistent peaks and valleys. This indicates the existence of serial correlation.
Fridays <-
mutate(ride5,friday = ifelse(dayotw == "Fri" & hour(interval_hour) == 1,interval_hour, 0)) %>%
filter(friday != 0)
rbind(
mutate(ride.Train, Legend = "Training"),
mutate(ride.Test, Legend = "Testing")) %>%
group_by(Legend, interval_hour) %>%
summarize(Trip_Count = sum(Trip_Count)) %>%
ungroup() %>%
ggplot(aes(interval_hour, Trip_Count, colour = Legend)) +
geom_line(size=2) +
scale_colour_manual(values = palette2) +
geom_vline(data = Fridays, aes(xintercept = interval_hour),linetype = "dashed",size=1) +
labs(
title="Rideshare trips by week: week20- week24",
subtitle="Dashed lines for every Friday",
x="Day", y="Trip Count") +
plotTheme()+ theme(panel.grid.major = element_blank())
Spatial autocorrelation means that the bicycle situation at a point is correlated with the surrounding bicycle situation. In this figure, we can see constant spatial clustering on a weekly basis. The higher frequency of use occurs in the center city area, an area that contains relatively high population density, which is also consistent with our common sense.
ride5_sf = st_as_sf(ride5, sf_column_name = "geometry" ,
crs = st_crs(NYCTracts), agr = "constant")
group_by(ride5_sf, week, start_station_name) %>%
summarize(Sum_Trip_Count = sum(Trip_Count)) %>%
ungroup() %>%
ggplot() +
geom_sf(data=PHITracts,color="grey50",size=0.5,fill = "transparent")+
geom_sf(aes(color = q5(Sum_Trip_Count)),size=2) +
geom_sf(data=st_union(PHITracts),color="black",size=3,fill = "transparent")+
facet_wrap(~week, ncol = 5) +
scale_color_manual(values = palette5,
labels = c("10","196","529","826","1283"),
name = "Trip_Count") +
labs(title="Sum of rideshare trips by tract and week") +
mapTheme(25,35) +
theme(legend.position = "bottom",
panel.border = element_rect(color = "black",fill = "transparent", size = 6))
test <- group_by(ride5_sf,dayotw,start_station_name) %>%
summarize(Sum_Trip_Count = sum(Trip_Count)) %>%
ungroup()
ggplot() +
geom_sf(data=PHITracts,color="grey50",size=0.5,fill = "transparent")+
geom_sf(data=test,aes(color = q5(Sum_Trip_Count)),size=1.25) +
geom_sf(data=st_union(PHITracts),color="black",size=2,fill = "transparent")+
facet_wrap(~dayotw, ncol = 7) +
scale_color_manual(values = palette5,
labels = c("12","140","375","596","916"),
name = "Trip_Count") +
labs(title="Sum of rideshare trips by tract and week") +
mapTheme(25,35) +
theme(legend.position = "bottom",
panel.border = element_rect(color = "black",fill = "transparent", size = 6))
Below is a graphical representation of the evolution of bicycle trips versus time using animation. The trips are divided into 5 categories for each station. From the evolving graph we can see that the impact of Time Lag is quite evident during the peak hours of the city.
ride.animation.data <- ride5_sf%>%
filter(week == 20)
ride.animation.data2 <-ride.animation.data%>%
mutate(Trips =
case_when(
Trip_Count == 0 ~ "0 trips",
Trip_Count > 0 & Trip_Count <= 3 ~ "1-3 trips",
Trip_Count > 3 & Trip_Count <= 11 ~ "4-11 trips",
Trip_Count > 11 & Trip_Count <= 28 ~ "12-28 trips",
Trip_Count > 28 ~ "28+ trips")) %>%
mutate(Trips = fct_relevel(Trips,"0 trips","1-3 trips","4-11 trips","12-28 trips","28+ trips"))
mapTheme2 <- function(base_size = 12, title_size = 15) {
theme(
text = element_text( color = "black"),
plot.title = element_text(hjust = 0.5, size = title_size, colour = "black",face = "bold"),
plot.subtitle = element_text(hjust = 0.5,size=base_size,face="italic"),
plot.caption = element_text(size=base_size,hjust=0),
axis.ticks = element_blank(),
panel.spacing = unit(6, 'lines'),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
strip.background = element_rect(fill = "grey80", color = "white"),
strip.text = element_text(size=base_size),
axis.title = element_text(face = "bold",size=base_size),
axis.text = element_blank(),
plot.background = element_blank(),
legend.background = element_blank(),
legend.title = element_text(size=base_size,colour = "black", face = "italic"),
legend.text = element_text(size=base_size,colour = "black", face = "italic"),
strip.text.x = element_text(size = base_size,face = "bold")
)
}
rideshare_animation <-ggplot() +
geom_sf(data=PHITracts,color="grey80",size=0.01,fill = "transparent")+
geom_sf(data = ride.animation.data2, aes(color = Trips),size=1.32) +
geom_sf(data=st_union(PHITracts),color="black",size=1.2,fill = "transparent")+
scale_color_manual(values = palette5) +
labs(title ="Rideshare pickups for week 36, Philadelphia",
subtitle = "One hour intervals: {current_frame}") +
transition_manual(interval_hour) +
mapTheme2() +
theme(legend.position = "bottom",
panel.border = element_rect(color = "black",fill = "transparent", size = 1))
animate(rideshare_animation, duration=10,renderer = gifski_renderer()) %>% knitr::include_graphics ()
In this section, five different Linear regressions are estimated with different fixed effects:
rer 0 focus on just additional features
reg1 focuses on just time, including additional features
reg2 focuses on just space fixed effects, including additional features
reg3 focuses on time, space fixed effects and additional features
reg4 adds the time lag features.
Time features like hour is set as a continuous feature, the interpretation is that a 1 hour increase is associated with an estimated change in Trip_Count. Spatial fixed effects for start_station_name are also included to account for the across tract differences, like amenities, access to transit, distance to the Loop, etc.
Ordinary least squares (OLS) is chosen, and the assumptions to the OLS Regression are met here.
#no time, no spatial
reg0 <- lm(Trip_Count ~dayotw + Temp + isPrecip + Wind,data=ride.Train)
#yes time, no spatial
reg1 <- lm(Trip_Count ~hour(interval_hour) + dayotw + Temp + isPrecip + Wind ,data=ride.Train)
#no time, yes spatial
reg2 <- lm(Trip_Count ~start_station_name + dayotw + Temp + isPrecip + Wind ,data=ride.Train)
#yes time, yes spatial
reg3 <- lm(Trip_Count ~start_station_name + hour(interval_hour) + dayotw + Temp + isPrecip + Wind ,data=ride.Train)
#yes time&lag, yes spatial
reg4 <- lm(Trip_Count ~start_station_name + hour(interval_hour) + dayotw + Temp + isPrecip + Wind + lagHour + lag2Hours + lag3Hours + lag4Hours +lag12Hours + lag1day,
data=ride.Train)
ride.Test.weekNest <-as.data.frame(ride.Test) %>%nest(-week)
model_pred <- function(dat, fit){pred <- predict(fit, newdata = dat)}
week_predictions <-ride.Test.weekNest %>%
mutate(
A_Baseline = map(.x = data, fit = reg0, .f = model_pred),
B_Time_FE = map(.x = data, fit = reg1, .f = model_pred),
C_Space_FE = map(.x = data, fit = reg2, .f = model_pred),
D_Space_Time_FE = map(.x = data, fit = reg3,.f = model_pred),
E_Space_Time_Lags = map(.x = data, fit = reg4,.f = model_pred))
First, we calculated the (MAE) for the five models. This data provides a visual measure of the accuracy of the models. From the icons, we can see that as my chosen model becomes more and more complex, the more predictors are added, the better the accuracy of the model. The addition of time lag significantly improves the accuracy of the model. The time and space effects also have a positive impact on the prediction of the model.
week_predictions <- week_predictions %>%
gather(Regression, Prediction, -data, -week) %>%
mutate(Observed = map(data, pull, Trip_Count),
Absolute_Error = map2(Observed, Prediction,~ abs(.x - .y)),
MAE = map_dbl(Absolute_Error, mean),
sd_AE = map_dbl(Absolute_Error, sd))
week_predictions2 <- week_predictions
week_predictions2$week <- as.character(week_predictions2$week)
week_predictions2 %>%
dplyr::select(week, Regression, MAE) %>%
gather(Variable, MAE, -Regression, -week) %>%
ggplot(aes(week, MAE)) +
geom_bar(aes(fill = Regression),
position = "dodge", stat="identity") +
scale_fill_manual(values = palette5) +
labs(title = "Mean Absolute Errors by model and week") +
plotTheme()+
theme(legend.position = "bottom")
Trips are used as vertical coordinates. As the model becomes more complex, the prediction results become closer to the actual results, and the addition of the time lag makes the model closer to the actual situation for each hour of the day.
week_predictions2 %>%
mutate(interval_hour = map(data, pull, interval_hour),
start_station_name =map(data, pull, start_station_name)) %>%
dplyr::select(interval_hour, start_station_name,
Observed, Prediction, Regression) %>%
unnest() %>%
gather(Variable, Value, -Regression, -interval_hour,-start_station_name) %>%
group_by(Regression, Variable, interval_hour) %>%
summarize(Value = mean(Value)) %>%
ggplot(aes(interval_hour, Value, colour=Variable)) +
geom_line(size = 3) +
facet_wrap(~Regression, ncol=1) +
scale_colour_manual(values = palette2) +
labs(
title = "Mean Predicted/Observed rideshare by hour",
x = "Hour", y= "Rideshare Trips") +
plotTheme()+
theme(legend.position = "bottom")
So how do these models perform in predicting bike use at each site? We calculated the MAE for each station using the data from weeks 23 and 24 as the test set. It is easy to see from the results that for all models, the accuracy is more accurate for the edge areas compared to the central city. However, the addition of the space feature of time lag, etc., makes the prediction of the peripheral range of Philadelphia more accurate. center city’s case still has a higher MAE as the model becomes more complex.
error.byWeek <-
filter(week_predictions2, Regression != "A_Baseline") %>%
unnest() %>%
st_sf() %>%
dplyr::select(start_station_name, Absolute_Error,week, geometry,Regression) %>%
gather(Variable, Value, -start_station_name,-week, -geometry,-Regression) %>%
group_by(Variable, start_station_name, week,Regression) %>%
summarize(MAE = mean(Value))
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(data, Prediction, Observed, Absolute_Error)`
## `summarise()` has grouped output by 'Variable', 'start_station_name', 'week'.
## You can override using the `.groups` argument.
q4 <- function(variable) {as.factor(ntile(variable, 4))}
ggplot() +
geom_sf(data=st_union(PHITracts),color="black",size=3,fill = "transparent")+
geom_sf(data=PHITracts,color="black",size=0.5,linetype ="dashed",fill = "transparent")+
geom_sf(data = error.byWeek, aes(color = q4(MAE)), size = 2) +
scale_color_manual(values=palette5,
labels = c("0","1","2","3"),
name = "MAE") +
facet_wrap(week~Regression, ncol = 4)+
labs(title = 'Mean absolute error by station and by week') +
mapTheme(25,35)+
theme(
legend.position = "bottom",
panel.border = element_rect(color = "black",fill = "transparent", size = 6))
error.byHour <-
filter(week_predictions2, Regression == "E_Space_Time_Lags" & week==24) %>%
unnest() %>%
filter(dayotw == "Mon")%>%
st_sf() %>%
dplyr::select(start_station_name, Absolute_Error,
geometry, interval_hour) %>%
gather(Variable, Value, -interval_hour,-start_station_name, -geometry) %>%
group_by(hour = hour(interval_hour), start_station_name) %>%
summarize(MAE = mean(Value))
The figure below shows the predicted MAE for each station for each hour. it can be seen from the figure that the MAE for each station increases as the morning and evening peaks approach. For example, at 17 and 19 pm, more dark dots appear on the map, representing that the model will be more inaccurate for the peak predictions.
ggplot() +
geom_sf(data=st_union(PHITracts),color="black",size=2,fill = "transparent")+
geom_sf(data=PHITracts,color="grey70",size=1,linetype ="dashed",fill = "transparent")+
geom_sf(data = error.byHour, aes(color = q4(MAE)), size = 2) +
scale_color_manual(values=palette4,
labels = c("0","1","2","2+"),
name = "MAE") +
facet_wrap(~hour, ncol = 8)+
labs(title = 'Mean absolute error by station and by hour') +
mapTheme()+
theme(
legend.position = "bottom",
panel.border = element_rect(color = "black",fill = "transparent", size = 4))
To further validate the generalizability of the model, cross-validate the E_Space_Time_Lags model by time hour (with features, time, spatial fixed effects, additional features and time lags) has been made. Each hour takes turns to be the test set: the model is trained on the other hours of the day and tested on this hour.
crossValidate <- function(dataset, id, dependentVariable, indVariables) {
allPredictions <- data.frame()
cvID_list <- unique(dataset[[id]])
for (i in cvID_list) {
thisFold <- i
cat("This hold out fold is", thisFold, "\n")
fold.train <- filter(dataset, dataset[[id]] != thisFold) %>% as.data.frame() %>%
dplyr::select(id, geometry,interval_hour, indVariables, dependentVariable)
fold.test <- filter(dataset, dataset[[id]] == thisFold) %>% as.data.frame() %>%
dplyr::select(id, geometry,interval_hour, indVariables, dependentVariable)
regression <- lm(paste0(dependentVariable,"~."),
data = fold.train %>% dplyr::select(-geometry))
thisPrediction <-
mutate(fold.test, Prediction = predict(regression, fold.test, type = "response"))
allPredictions <-
rbind(allPredictions, thisPrediction)
gc()
}
return(st_sf(allPredictions))
}
ride.cross <- ride5
ride.cross <- ride.cross%>%mutate(hour=hour(interval_hour))
reg.logo_vars <- c("hour","start_station_name","dayotw","Temp","isPrecip","Wind","lagHour","lag2Hours", "lag3Hours","lag4Hours","lag12Hours","lag1day")
reg.logo_hour <- crossValidate(
dataset = ride.cross,
id = "hour",
dependentVariable = "Trip_Count",
indVariables = reg.logo_vars)
From the table we can see that the MAE is concentrated in 1, 2, 3. For the case of lower number of rentals we tend to have higher prediction results, while for the case of higher number of rentals we have lower prediction results, and we have better prediction results when the number of rentals is around 7.5.
reg.logo_hour <- reg.logo_hour%>%
mutate(MAE = abs(Prediction - Trip_Count))%>%
na.omit()
grid.arrange(ncol=2,
reg.logo_hour %>%
ggplot(aes(MAE)) +
geom_histogram(bins = 40, colour="white", fill=palette1_main) +
scale_x_continuous(breaks = seq(0, 3, by = 1)) +
labs(title="Distribution of MAE, LOGO-CV",
x="Mean Absolute Error", y="Trip Count") +
plotTheme()+
theme(legend.position="bottom",
axis.text = element_text(size=25)),
st_drop_geometry(reg.logo_hour) %>%
mutate(Count_Decile = ntile(Trip_Count, 10)) %>%
group_by(Count_Decile) %>%
summarize(meanObserved = mean(Trip_Count, na.rm=T),
meanPrediction = mean(Prediction, na.rm=T)) %>%
gather(Variable, Value, -Count_Decile) %>%
ggplot(aes(Count_Decile, Value, shape = Variable)) +
geom_path(aes(group = Count_Decile), size=2,colour = palette1_assist) +
geom_point(size = 6,color=palette1_main) +
scale_shape_manual(values = c(2, 17)) +
xlim(0,10) +
ylim(0,10)+
labs(title = "Predicted and observed") +
plotTheme()+
theme(legend.position = c(.3,.8)))
reg.logo_hour %>%
st_drop_geometry()%>%
dplyr::select(interval_hour, start_station_name,
Trip_Count, Prediction) %>%
gather(Variable, Value, -interval_hour,-start_station_name) %>%
group_by(Variable, interval_hour) %>%
summarize(Value = mean(Value))%>%
ggplot(aes(interval_hour, Value, colour=Variable)) +
geom_line(size = 2) +
scale_colour_manual(values = palette2) +
labs(
title = "Mean Predicted/Observed rideshare by time",
x = "Time", y= "Rideshare Trips") +
plotTheme()+
theme(legend.position = "bottom")
Presenting the MAE on the map by hour, we can see that the predictions are more accurate in the practice periods with lower usage in the early morning hours. Spatially, the MAE is higher in the central city than in the surrounding areas.
LOGO_SPD <- reg.logo_hour %>%
st_drop_geometry()%>%
filter(dayotw == "Mon")%>%
dplyr::select(hour, start_station_name,MAE) %>%
gather(Variable, Value, -hour,-start_station_name) %>%
group_by(Variable, hour,start_station_name,) %>%
summarize(Value = mean(Value))%>%
left_join(ride_station%>%dplyr::select(-GEOID),by="start_station_name")%>%
st_sf()
ggplot() +
geom_sf(data=st_union(PHITracts),color="black",size=2,fill = "transparent")+
geom_sf(data=PHITracts,color="grey80",size=0.5,fill = "transparent")+
geom_sf(data = LOGO_SPD, aes(color = q4(Value)), size = 2) +
scale_color_manual(values=palette4,
labels = c("0","1","2","2+"),
name = "MAE") +
facet_wrap(~hour, ncol = 8)+
labs(title = 'Mean absolute error by station and by hour - LOGO') +
mapTheme()+
theme(
legend.position = "bottom",
panel.border = element_rect(color = "black",fill = "transparent", size = 3))
This time we analyzed the predicted bicycle rental usage using data from IndegoBike in Philadelphia. First using five different regression model algorithms, we can conclude that the prediction accuracy of the model continues to improve with the addition of spatial features and time lag. However, the model itself still has a tendency to predict higher for lower usage cases and lower for higher usage cases. In general, the model supports advance bicycle dispatching. We can pre-deploy bikes more in advance of the prediction results. For example, we can increase the supply of bicycles in the central city before the high time period, so as to better meet the travel demand of users.