Posted on

The first satRday was held in Budapest the last year. There was a Visualization Challenge and I decided to participate on it. Unfortunatelly, because of technical difficulties, I couldn’t send my work in before the deadline, but I often come back to this data set, for testing out new things like this blog.

Setup

I started to get into R around the satRday, and I learned a lot since then, so I decided I update the a code a bit. First, I split out the data prepration in seperate files, and I made the setup more dynamic.

if (!require("pacman")) install.packages("pacman")
pacman::p_load(rprojroot,
               knitr,
               printr,
               bookdown,
               viridis,
               ggthemes)

root <- find_root(is_rstudio_project)

opts_knit$set(root.dir = root)

knitr::opts_chunk$set(echo = TRUE,
                      cache = TRUE,
                      include = TRUE,
                      message = FALSE,
                      warning = FALSE,
                      error = FALSE,
                      fig.align = 'center',
                      fig.show  = 'asis',
                      # fig.width = 12,
                      # fig.height = 12,
                      fig.retina = TRUE
                      )

# rm(list = ls())
bud_url <- "http://budapest.satrdays.org/data/BUD%20flights%202007-2012%20v2.xlsx"
airports_url <- "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"

raw_path <- "/data/raw/"
flights_name <- paste0(root, raw_path, "flights.xlsx")
airports_name <- paste0(root, raw_path, "airports.csv")

dir.create(paste0(root, raw_path), recursive = TRUE)

if(!file.exists(flights_name)) {
  download.file(url = bud_url,
                destfile = flights_name,
                method = "libcurl", mode = "wb")
}

if(!file.exists(airports_name)) {
  download.file(url = airports_url,
                destfile = airports_name,
                method = "libcurl", mode = "wb")
}

# removes the data folder and its contents recursivelly
# unlink(x = "data*", recursive = TRUE)
pacman::p_load(readxl,
               tidyverse,
               lubridate)

file_list <- dir(paste0(root, raw_path), pattern = ".xlsx", full.names = TRUE)

flights <- read_excel(file_list, sheet = 1)

# colnames(flights) <- tolower(gsub(" ", "_", names(flights)))

# flights_cleaned <- flights %>%
#   mutate(month = month(date)) %>%
#   select(-c(region, commercial_flag, date_half_year, date_year_quarter,
#             date_year_month, date, destination)) %>%
#   filter(!is.na(city))
# 
# char_cols <- names(flights_cleaned[, sapply(flights_cleaned, class) == 'character'])
# 
# flights_cleaned[sapply(flights_cleaned, is.character)] <-
#   lapply(flights_cleaned[sapply(flights_cleaned, is.character)],
#          as.factor)
dimension <- dim(flights)
names(dimension) <- (c("rows", "columns"))

knitr::kable(dimension, caption = "Flights dataset dimensions")
Flights dataset dimensions
rows 18837
columns 16
knitr::kable(t(head(flights, n = 4)), caption = "First few rows of the data")
First few rows of the data
DESTINATION Aalborg Aalborg Aalborg Aarhus
COMMERCIAL FLAG Commercial Commercial Commercial Commercial
CITY Aalborg Aalborg Aalborg Aarhus
REGION NA NA NA NA
COUNTRY Denmark Denmark Denmark Denmark
FLIGH DIRECTION Incoming Outgoing Incoming Incoming
FLIGHT TYPE Non Scheduled Non Scheduled Non Scheduled Non Scheduled
DATE 2010-11-01 2010-12-01 2011-01-01 2007-05-01
DATE YEAR 2010 2010 2011 2007
DATE HALF YEAR 2010H2 2010H2 2011H1 2007H1
DATE YEAR QUARTER 2010Q4 2010Q4 2011Q1 2007Q2
DATE YEAR MONTH 201011 201012 201101 200705
NBR OF PASSENGERS 0 0 0 102
CARGO WEIGHT 0 0 0 0
NBR OF FLIGHTS 1 1 1 1
SEAT CAPACITY 0 0 0 200
col_meaning <- data.frame(names(flights),
                  c("Flight destination", "Aircraft type", "Destination city",
                    "Country region", "Country", "The direction of the flight",
                    "Flight type", "Flight date", "Flight year",
                    "Flight half year", "Flight quarter", "Flight month",
                    "Number of passengers", "Weight of transported cargo",
                    "Number of flights on this route", "Seat capacity"))

names(col_meaning) <- c("Column name", "Description of the data")

knitr::kable(col_meaning, caption = "Column names, and desctiption")
Column names, and desctiption
Column name Description of the data
DESTINATION Flight destination
COMMERCIAL FLAG Aircraft type
CITY Destination city
REGION Country region
COUNTRY Country
FLIGH DIRECTION The direction of the flight
FLIGHT TYPE Flight type
DATE Flight date
DATE YEAR Flight year
DATE HALF YEAR Flight half year
DATE YEAR QUARTER Flight quarter
DATE YEAR MONTH Flight month
NBR OF PASSENGERS Number of passengers
CARGO WEIGHT Weight of transported cargo
NBR OF FLIGHTS Number of flights on this route
SEAT CAPACITY Seat capacity
na_values <- map(.x = flights, .f = is.na) %>%
  map(.f = sum) %>%
  as.data.frame() %>%
  t()

knitr::kable(na_values, caption = "Number of missing values in the columns")
Number of missing values in the columns
DESTINATION 2
COMMERCIAL.FLAG 0
CITY 2
REGION 14520
COUNTRY 0
FLIGH.DIRECTION 0
FLIGHT.TYPE 0
DATE 0
DATE.YEAR 0
DATE.HALF.YEAR 0
DATE.YEAR.QUARTER 0
DATE.YEAR.MONTH 0
NBR.OF.PASSENGERS 0
CARGO.WEIGHT 0
NBR.OF.FLIGHTS 0
SEAT.CAPACITY 0
colnames(flights) <- tolower(gsub(" ", "_", names(flights)))

flights_filtered <- flights %>%
  select(-c(commercial_flag)) %>%
  extract(date_half_year, regex = "([A-Z][0-9])",
          into = "date_half_year") %>%
  extract(date_year_quarter, regex = "([A-Z][0-9])",
          into = "date_year_quarter") %>%
  extract(date_year_month, regex = "[0-9]{4}([0-9]{2})",
          into = "date_year_month") %>%
  filter(!is.na(city)) %>%
  select(-c(city))
eda_plot <- ggplot(flights, aes(nbr_of_passengers, seat_capacity)) +
  geom_point() +
  xlab("Nr. of passengers/route/month") +
  ylab("Seat capacity/route/month")

eda_plot
First look at the data

First look at the data

# dir.create(paste0(root, "img"), recursive = TRUE)
# 
# ggsave(filename = paste0(root, "img/eda_plot.png"), device = "png",
#        plot = eda_plot, limitsize = TRUE, dpi = 72)
# 
# knitr::include_graphics(paste0(root, "img/eda_plot.png"))
# flights <- read_csv("data/interim/flights_v2.csv")

flights <- flights %>%
  mutate(lg_pass = log2(nbr_of_passengers),
         lg_seat = log2(seat_capacity))

flights[!is.finite(flights$lg_pass), "lg_pass"] <- 0
flights[!is.finite(flights$lg_seat), "lg_seat"] <- 0

mod <- lm(lg_seat ~ lg_pass, data = flights)

flights <- flights %>% mutate(rel_seat = resid(mod))

binned_plot <- ggplot(flights, aes(lg_pass, lg_seat - lg_pass)) +
  geom_bin2d() +
  xlab("Log2 of Nr. of Passengers/route/month") +
  ylab("Log2 of Seat Capacity/route/month")

binned_plot
Log converted data

Log converted data

# ggsave(filename = "img/binned_plot.png", device = "png",
#        plot = binned_plot, limitsize = TRUE, dpi = 72)
# 
# knitr::include_graphics("img/binned_plot.png")
# plot with background
# from:
# https://drsimonj.svbtle.com/plotting-background-data-for-groups-with-ggplot2
flights_all <- flights %>% select(-date_year)

yearly_dist <- ggplot(flights, aes(lg_pass, lg_seat - lg_pass)) +
  geom_point(data = flights_all, color = "grey", alpha = 0.2) +
  geom_point(aes(color = factor(date_year))) +
  facet_wrap(~date_year, ncol = 3) +
  scale_color_viridis(discrete = TRUE, "Year") + theme_bw() +
  xlab("Log2 of Nr. of Passengers per route per month") +
  ylab("Log2 of Seat Capacity per route per month")

yearly_dist

# ggsave(filename = "img/yearly_dist.png", device = "png",
#        plot = yearly_dist, limitsize = TRUE, dpi = 100)
# 
# knitr::include_graphics("img/yearly_dist.png")
# from: https://mvuorre.github.io/r/github-waffle-plot/
geom_waffle <- function(data, date_par, data_scale,
                        scale_name, pal = "D", dir = -1){
  
waffle_plot <- ggplot(data, aes(x = month(date_par), y = year(date_par),
                               fill = data_scale)) +
    scale_fill_viridis(name = scale_name, 
                       option = pal,  # Variable color palette
                       direction = dir,  # Variable color direction
                       na.value = "grey93",
                       limits = c(0, max(data_scale))) +
    geom_tile(color = "white", size = 0.4) +
    scale_x_continuous(
      expand = c(0, 0),
      breaks = seq(length = 12),
      labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) +
    scale_y_continuous(
      expand = c(0, 0),
      breaks = seq(min(year(date_par)),
                   max(year(date_par)), by = 1) 
    ) +
    theme_tufte(base_family = "Helvetica") +
    theme(axis.title = element_text(),
          axis.ticks = element_blank(),
          legend.position = "bottom",
          legend.key.width = unit(1, "cm"),
          strip.text = element_text(hjust = 0.01, face = "bold", size = 12)) +
    xlab("month") +
    ylab("year")
  
  return(waffle_plot)
}
# flights <- read_csv("data/interim/flights_v2.csv")

waffle1 <- geom_waffle(flights, flights$date,
            flights$nbr_of_passengers,
            scale_name = "Nr. of Passengers/route/month")

waffle1
Heatmap of passengers around the years

Heatmap of passengers around the years

# ggsave(filename = "img/waffle1.png", device = "png",
#        plot = waffle1, limitsize = TRUE, dpi = 72)
# 
# knitr::include_graphics("img/waffle1.png")
waffle2 <- geom_waffle(flights, flights$date,
            flights$seat_capacity,
            scale_name = "Seat Capacity per route per month")
waffle2
Heatmap of the seat capacity around the years

Heatmap of the seat capacity around the years

# ggsave(filename = "img/waffle2.png", device = "png",
#        plot = waffle2, limitsize = TRUE, dpi = 72)
# 
# knitr::include_graphics("img/waffle2.png")
waffle3 <- geom_waffle(flights, flights$date,
            flights$nbr_of_flights,
            scale_name = "Nr. of Flights per route per month")

waffle3
Heatmap of flights around the years

Heatmap of flights around the years

# ggsave(filename = "img/waffle3.png", device = "png",
#        plot = waffle3, limitsize = TRUE, dpi = 72)
# 
# knitr::include_graphics("img/waffle3.png")

I also made a heatmap which shows which month had the most passengers for the top 25 countries (Figure ??):

grp_flights <- flights %>%
  mutate(years = as.numeric(date_year) + as.numeric(date_year_month)/12) %>%
  group_by(country, date_year, date_year_month, years) %>%
  summarise(nbr_pass = max(log2(sum(nbr_of_passengers)), 0))

top_countries <- flights %>%
  group_by(country) %>%
  summarise(nbr_pass_country = sum(nbr_of_passengers)) %>%
  top_n(n = 25, wt = nbr_pass_country)

country_list <- as.list(top_countries$country)

grp_flights_filtered <- grp_flights %>%
  slice(country %in% top_countries$country)

min_pass <- min(grp_flights_filtered$nbr_pass)
max_pass <- max(grp_flights_filtered$nbr_pass)
bins <- 5
breaks <- seq(round(min_pass,2), round(max_pass,2) ,
              by = round((max_pass-min_pass)/bins, 2))

heatmap <- ggplot(grp_flights_filtered, aes(y=country, x=years, fill=nbr_pass)) + 
  geom_tile(color="white",
            width=.9, height=.9) + theme_minimal() +
  scale_fill_gradientn(colors = viridis::viridis(bins),
                       limits=c(min_pass, max_pass),
                       breaks = breaks,
                       na.value=rgb(246, 246, 246, max=255),
                       labels = breaks,
                       guide=guide_colourbar(ticks=T, nbin=bins,
                                             barheight=.5, label=T, 
                                             barwidth=1+sqrt(bins*10))) +
  scale_x_continuous(expand=c(0,0), 
                     breaks=seq(2007, 2012, by=1)) +
  labs(x="", y="", fill="") +
  ggtitle("Nr. of montly passengers") +
  theme(legend.position=c(.5, -.10),
        legend.direction="horizontal",
        legend.text=element_text(colour="grey20", size = 6),
        plot.margin=grid::unit(c(.5,1.5,1.5,.1), "cm"),
        axis.text.y=element_text(size=6, 
                                 hjust=1),
        axis.text.x=element_text(size=8),
        axis.ticks.y=element_blank(),
        panel.grid=element_blank(),
        title=element_text(hjust=-.07, face="bold", vjust=1))

heatmap
Heatmap of the passenger numbers for the top 25 countries

Heatmap of the passenger numbers for the top 25 countries

# ggsave(filename = "img/heatmap.png", device = "png",
#        plot = heatmap, limitsize = TRUE, dpi = 120)
# 
# knitr::include_graphics("img/heatmap.png")
library(GGally)
cor_mat <- cor(data.matrix(flights_filtered))

ggcorr(cor_mat)
Correlations between variables

Correlations between variables