The first satRday was held in Budapest the last year. There was a Visualization Challenge and I decided to participate on it. Unfortunatelly, because of technical difficulties, I couldn’t send my work in before the deadline, but I often come back to this data set, for testing out new things like this blog.
Setup
I started to get into R around the satRday, and I learned a lot since then, so I decided I update the a code a bit. First, I split out the data prepration in seperate files, and I made the setup more dynamic.
if (!require("pacman")) install.packages("pacman")
pacman::p_load(rprojroot,
knitr,
printr,
bookdown,
viridis,
ggthemes)
root <- find_root(is_rstudio_project)
opts_knit$set(root.dir = root)
knitr::opts_chunk$set(echo = TRUE,
cache = TRUE,
include = TRUE,
message = FALSE,
warning = FALSE,
error = FALSE,
fig.align = 'center',
fig.show = 'asis',
# fig.width = 12,
# fig.height = 12,
fig.retina = TRUE
)
# rm(list = ls())
bud_url <- "http://budapest.satrdays.org/data/BUD%20flights%202007-2012%20v2.xlsx"
airports_url <- "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
raw_path <- "/data/raw/"
flights_name <- paste0(root, raw_path, "flights.xlsx")
airports_name <- paste0(root, raw_path, "airports.csv")
dir.create(paste0(root, raw_path), recursive = TRUE)
if(!file.exists(flights_name)) {
download.file(url = bud_url,
destfile = flights_name,
method = "libcurl", mode = "wb")
}
if(!file.exists(airports_name)) {
download.file(url = airports_url,
destfile = airports_name,
method = "libcurl", mode = "wb")
}
# removes the data folder and its contents recursivelly
# unlink(x = "data*", recursive = TRUE)
pacman::p_load(readxl,
tidyverse,
lubridate)
file_list <- dir(paste0(root, raw_path), pattern = ".xlsx", full.names = TRUE)
flights <- read_excel(file_list, sheet = 1)
# colnames(flights) <- tolower(gsub(" ", "_", names(flights)))
# flights_cleaned <- flights %>%
# mutate(month = month(date)) %>%
# select(-c(region, commercial_flag, date_half_year, date_year_quarter,
# date_year_month, date, destination)) %>%
# filter(!is.na(city))
#
# char_cols <- names(flights_cleaned[, sapply(flights_cleaned, class) == 'character'])
#
# flights_cleaned[sapply(flights_cleaned, is.character)] <-
# lapply(flights_cleaned[sapply(flights_cleaned, is.character)],
# as.factor)
dimension <- dim(flights)
names(dimension) <- (c("rows", "columns"))
knitr::kable(dimension, caption = "Flights dataset dimensions")
Flights dataset dimensions
rows |
18837 |
columns |
16 |
knitr::kable(t(head(flights, n = 4)), caption = "First few rows of the data")
First few rows of the data
DESTINATION |
Aalborg |
Aalborg |
Aalborg |
Aarhus |
COMMERCIAL FLAG |
Commercial |
Commercial |
Commercial |
Commercial |
CITY |
Aalborg |
Aalborg |
Aalborg |
Aarhus |
REGION |
NA |
NA |
NA |
NA |
COUNTRY |
Denmark |
Denmark |
Denmark |
Denmark |
FLIGH DIRECTION |
Incoming |
Outgoing |
Incoming |
Incoming |
FLIGHT TYPE |
Non Scheduled |
Non Scheduled |
Non Scheduled |
Non Scheduled |
DATE |
2010-11-01 |
2010-12-01 |
2011-01-01 |
2007-05-01 |
DATE YEAR |
2010 |
2010 |
2011 |
2007 |
DATE HALF YEAR |
2010H2 |
2010H2 |
2011H1 |
2007H1 |
DATE YEAR QUARTER |
2010Q4 |
2010Q4 |
2011Q1 |
2007Q2 |
DATE YEAR MONTH |
201011 |
201012 |
201101 |
200705 |
NBR OF PASSENGERS |
0 |
0 |
0 |
102 |
CARGO WEIGHT |
0 |
0 |
0 |
0 |
NBR OF FLIGHTS |
1 |
1 |
1 |
1 |
SEAT CAPACITY |
0 |
0 |
0 |
200 |
col_meaning <- data.frame(names(flights),
c("Flight destination", "Aircraft type", "Destination city",
"Country region", "Country", "The direction of the flight",
"Flight type", "Flight date", "Flight year",
"Flight half year", "Flight quarter", "Flight month",
"Number of passengers", "Weight of transported cargo",
"Number of flights on this route", "Seat capacity"))
names(col_meaning) <- c("Column name", "Description of the data")
knitr::kable(col_meaning, caption = "Column names, and desctiption")
Column names, and desctiption
DESTINATION |
Flight destination |
COMMERCIAL FLAG |
Aircraft type |
CITY |
Destination city |
REGION |
Country region |
COUNTRY |
Country |
FLIGH DIRECTION |
The direction of the flight |
FLIGHT TYPE |
Flight type |
DATE |
Flight date |
DATE YEAR |
Flight year |
DATE HALF YEAR |
Flight half year |
DATE YEAR QUARTER |
Flight quarter |
DATE YEAR MONTH |
Flight month |
NBR OF PASSENGERS |
Number of passengers |
CARGO WEIGHT |
Weight of transported cargo |
NBR OF FLIGHTS |
Number of flights on this route |
SEAT CAPACITY |
Seat capacity |
na_values <- map(.x = flights, .f = is.na) %>%
map(.f = sum) %>%
as.data.frame() %>%
t()
knitr::kable(na_values, caption = "Number of missing values in the columns")
Number of missing values in the columns
DESTINATION |
2 |
COMMERCIAL.FLAG |
0 |
CITY |
2 |
REGION |
14520 |
COUNTRY |
0 |
FLIGH.DIRECTION |
0 |
FLIGHT.TYPE |
0 |
DATE |
0 |
DATE.YEAR |
0 |
DATE.HALF.YEAR |
0 |
DATE.YEAR.QUARTER |
0 |
DATE.YEAR.MONTH |
0 |
NBR.OF.PASSENGERS |
0 |
CARGO.WEIGHT |
0 |
NBR.OF.FLIGHTS |
0 |
SEAT.CAPACITY |
0 |
colnames(flights) <- tolower(gsub(" ", "_", names(flights)))
flights_filtered <- flights %>%
select(-c(commercial_flag)) %>%
extract(date_half_year, regex = "([A-Z][0-9])",
into = "date_half_year") %>%
extract(date_year_quarter, regex = "([A-Z][0-9])",
into = "date_year_quarter") %>%
extract(date_year_month, regex = "[0-9]{4}([0-9]{2})",
into = "date_year_month") %>%
filter(!is.na(city)) %>%
select(-c(city))
eda_plot <- ggplot(flights, aes(nbr_of_passengers, seat_capacity)) +
geom_point() +
xlab("Nr. of passengers/route/month") +
ylab("Seat capacity/route/month")
eda_plot
# dir.create(paste0(root, "img"), recursive = TRUE)
#
# ggsave(filename = paste0(root, "img/eda_plot.png"), device = "png",
# plot = eda_plot, limitsize = TRUE, dpi = 72)
#
# knitr::include_graphics(paste0(root, "img/eda_plot.png"))
# flights <- read_csv("data/interim/flights_v2.csv")
flights <- flights %>%
mutate(lg_pass = log2(nbr_of_passengers),
lg_seat = log2(seat_capacity))
flights[!is.finite(flights$lg_pass), "lg_pass"] <- 0
flights[!is.finite(flights$lg_seat), "lg_seat"] <- 0
mod <- lm(lg_seat ~ lg_pass, data = flights)
flights <- flights %>% mutate(rel_seat = resid(mod))
binned_plot <- ggplot(flights, aes(lg_pass, lg_seat - lg_pass)) +
geom_bin2d() +
xlab("Log2 of Nr. of Passengers/route/month") +
ylab("Log2 of Seat Capacity/route/month")
binned_plot
# ggsave(filename = "img/binned_plot.png", device = "png",
# plot = binned_plot, limitsize = TRUE, dpi = 72)
#
# knitr::include_graphics("img/binned_plot.png")
# plot with background
# from:
# https://drsimonj.svbtle.com/plotting-background-data-for-groups-with-ggplot2
flights_all <- flights %>% select(-date_year)
yearly_dist <- ggplot(flights, aes(lg_pass, lg_seat - lg_pass)) +
geom_point(data = flights_all, color = "grey", alpha = 0.2) +
geom_point(aes(color = factor(date_year))) +
facet_wrap(~date_year, ncol = 3) +
scale_color_viridis(discrete = TRUE, "Year") + theme_bw() +
xlab("Log2 of Nr. of Passengers per route per month") +
ylab("Log2 of Seat Capacity per route per month")
yearly_dist
# ggsave(filename = "img/yearly_dist.png", device = "png",
# plot = yearly_dist, limitsize = TRUE, dpi = 100)
#
# knitr::include_graphics("img/yearly_dist.png")
# from: https://mvuorre.github.io/r/github-waffle-plot/
geom_waffle <- function(data, date_par, data_scale,
scale_name, pal = "D", dir = -1){
waffle_plot <- ggplot(data, aes(x = month(date_par), y = year(date_par),
fill = data_scale)) +
scale_fill_viridis(name = scale_name,
option = pal, # Variable color palette
direction = dir, # Variable color direction
na.value = "grey93",
limits = c(0, max(data_scale))) +
geom_tile(color = "white", size = 0.4) +
scale_x_continuous(
expand = c(0, 0),
breaks = seq(length = 12),
labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
scale_y_continuous(
expand = c(0, 0),
breaks = seq(min(year(date_par)),
max(year(date_par)), by = 1)
) +
theme_tufte(base_family = "Helvetica") +
theme(axis.title = element_text(),
axis.ticks = element_blank(),
legend.position = "bottom",
legend.key.width = unit(1, "cm"),
strip.text = element_text(hjust = 0.01, face = "bold", size = 12)) +
xlab("month") +
ylab("year")
return(waffle_plot)
}
# flights <- read_csv("data/interim/flights_v2.csv")
waffle1 <- geom_waffle(flights, flights$date,
flights$nbr_of_passengers,
scale_name = "Nr. of Passengers/route/month")
waffle1
# ggsave(filename = "img/waffle1.png", device = "png",
# plot = waffle1, limitsize = TRUE, dpi = 72)
#
# knitr::include_graphics("img/waffle1.png")
waffle2 <- geom_waffle(flights, flights$date,
flights$seat_capacity,
scale_name = "Seat Capacity per route per month")
waffle2
# ggsave(filename = "img/waffle2.png", device = "png",
# plot = waffle2, limitsize = TRUE, dpi = 72)
#
# knitr::include_graphics("img/waffle2.png")
waffle3 <- geom_waffle(flights, flights$date,
flights$nbr_of_flights,
scale_name = "Nr. of Flights per route per month")
waffle3
# ggsave(filename = "img/waffle3.png", device = "png",
# plot = waffle3, limitsize = TRUE, dpi = 72)
#
# knitr::include_graphics("img/waffle3.png")
I also made a heatmap which shows which month had the most passengers for the top 25 countries (Figure ??):
grp_flights <- flights %>%
mutate(years = as.numeric(date_year) + as.numeric(date_year_month)/12) %>%
group_by(country, date_year, date_year_month, years) %>%
summarise(nbr_pass = max(log2(sum(nbr_of_passengers)), 0))
top_countries <- flights %>%
group_by(country) %>%
summarise(nbr_pass_country = sum(nbr_of_passengers)) %>%
top_n(n = 25, wt = nbr_pass_country)
country_list <- as.list(top_countries$country)
grp_flights_filtered <- grp_flights %>%
slice(country %in% top_countries$country)
min_pass <- min(grp_flights_filtered$nbr_pass)
max_pass <- max(grp_flights_filtered$nbr_pass)
bins <- 5
breaks <- seq(round(min_pass,2), round(max_pass,2) ,
by = round((max_pass-min_pass)/bins, 2))
heatmap <- ggplot(grp_flights_filtered, aes(y=country, x=years, fill=nbr_pass)) +
geom_tile(color="white",
width=.9, height=.9) + theme_minimal() +
scale_fill_gradientn(colors = viridis::viridis(bins),
limits=c(min_pass, max_pass),
breaks = breaks,
na.value=rgb(246, 246, 246, max=255),
labels = breaks,
guide=guide_colourbar(ticks=T, nbin=bins,
barheight=.5, label=T,
barwidth=1+sqrt(bins*10))) +
scale_x_continuous(expand=c(0,0),
breaks=seq(2007, 2012, by=1)) +
labs(x="", y="", fill="") +
ggtitle("Nr. of montly passengers") +
theme(legend.position=c(.5, -.10),
legend.direction="horizontal",
legend.text=element_text(colour="grey20", size = 6),
plot.margin=grid::unit(c(.5,1.5,1.5,.1), "cm"),
axis.text.y=element_text(size=6,
hjust=1),
axis.text.x=element_text(size=8),
axis.ticks.y=element_blank(),
panel.grid=element_blank(),
title=element_text(hjust=-.07, face="bold", vjust=1))
heatmap
# ggsave(filename = "img/heatmap.png", device = "png",
# plot = heatmap, limitsize = TRUE, dpi = 120)
#
# knitr::include_graphics("img/heatmap.png")
library(GGally)
cor_mat <- cor(data.matrix(flights_filtered))
ggcorr(cor_mat)