library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.1.5
## ✔ ggplot2 4.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
# Show the search path: the attached packages and environments, in lookup order.
search()
## [1] ".GlobalEnv" "package:nycflights13" "package:lubridate"
## [4] "package:forcats" "package:stringr" "package:purrr"
## [7] "package:readr" "package:tidyr" "package:ggplot2"
## [10] "package:tidyverse" "package:tibble" "package:dplyr"
## [13] "package:stats" "package:graphics" "package:grDevices"
## [16] "package:utils" "package:datasets" "package:methods"
## [19] "Autoloads" "package:base"
# List the objects available in the attached nycflights13 package environment.
ls("package:nycflights13")
## [1] "airlines" "airports" "flights" "planes" "weather"
# The global environment holds no objects at this point.
ls()
## character(0)
# Copy the flights and weather tables into the global environment.
tFlights <- nycflights13::flights
tWeather <- nycflights13::weather
# Confirm that both copies now exist in the global environment.
ls()
## [1] "tFlights" "tWeather"
# Preview the first 10 rows of the flights table.
head(tFlights, 10)
## # A tibble: 10 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# Preview the first 10 rows of the weather table.
head(tWeather, 10)
## # A tibble: 10 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
# Show the dimensions and column types of the flights table.
str(tFlights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Per-column summary statistics for flights, including NA counts.
summary(tFlights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
# Show the dimensions and column types of the weather table.
str(tWeather)
## tibble [26,115 × 15] (S3: tbl_df/tbl/data.frame)
## $ origin : chr [1:26115] "EWR" "EWR" "EWR" "EWR" ...
## $ year : int [1:26115] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:26115] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:26115] 1 1 1 1 1 1 1 1 1 1 ...
## $ hour : int [1:26115] 1 2 3 4 5 6 7 8 9 10 ...
## $ temp : num [1:26115] 39 39 39 39.9 39 ...
## $ dewp : num [1:26115] 26.1 27 28 28 28 ...
## $ humid : num [1:26115] 59.4 61.6 64.4 62.2 64.4 ...
## $ wind_dir : num [1:26115] 270 250 240 250 260 240 240 250 260 260 ...
## $ wind_speed: num [1:26115] 10.36 8.06 11.51 12.66 12.66 ...
## $ wind_gust : num [1:26115] NA NA NA NA NA NA NA NA NA NA ...
## $ precip : num [1:26115] 0 0 0 0 0 0 0 0 0 0 ...
## $ pressure : num [1:26115] 1012 1012 1012 1012 1012 ...
## $ visib : num [1:26115] 10 10 10 10 10 10 10 10 10 10 ...
## $ time_hour : POSIXct[1:26115], format: "2013-01-01 01:00:00" "2013-01-01 02:00:00" ...
# Per-column summary statistics for weather, including NA counts.
summary(tWeather)
## origin year month day
## Length:26115 Min. :2013 Min. : 1.000 Min. : 1.00
## Class :character 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00
## Mode :character Median :2013 Median : 7.000 Median :16.00
## Mean :2013 Mean : 6.504 Mean :15.68
## 3rd Qu.:2013 3rd Qu.: 9.000 3rd Qu.:23.00
## Max. :2013 Max. :12.000 Max. :31.00
##
## hour temp dewp humid
## Min. : 0.00 Min. : 10.94 Min. :-9.94 Min. : 12.74
## 1st Qu.: 6.00 1st Qu.: 39.92 1st Qu.:26.06 1st Qu.: 47.05
## Median :11.00 Median : 55.40 Median :42.08 Median : 61.79
## Mean :11.49 Mean : 55.26 Mean :41.44 Mean : 62.53
## 3rd Qu.:17.00 3rd Qu.: 69.98 3rd Qu.:57.92 3rd Qu.: 78.79
## Max. :23.00 Max. :100.04 Max. :78.08 Max. :100.00
## NA's :1 NA's :1 NA's :1
## wind_dir wind_speed wind_gust precip
## Min. : 0.0 Min. : 0.000 Min. :16.11 Min. :0.000000
## 1st Qu.:120.0 1st Qu.: 6.905 1st Qu.:20.71 1st Qu.:0.000000
## Median :220.0 Median : 10.357 Median :24.17 Median :0.000000
## Mean :199.8 Mean : 10.517 Mean :25.49 Mean :0.004469
## 3rd Qu.:290.0 3rd Qu.: 13.809 3rd Qu.:28.77 3rd Qu.:0.000000
## Max. :360.0 Max. :1048.361 Max. :66.75 Max. :1.210000
## NA's :460 NA's :4 NA's :20778
## pressure visib time_hour
## Min. : 983.8 Min. : 0.000 Min. :2013-01-01 01:00:00
## 1st Qu.:1012.9 1st Qu.:10.000 1st Qu.:2013-04-01 21:30:00
## Median :1017.6 Median :10.000 Median :2013-07-01 14:00:00
## Mean :1017.9 Mean : 9.255 Mean :2013-07-01 18:26:37
## 3rd Qu.:1023.0 3rd Qu.:10.000 3rd Qu.:2013-09-30 13:00:00
## Max. :1042.1 Max. :10.000 Max. :2013-12-30 18:00:00
## NA's :2729
# tFlights columns used:
# year, month, day, dep_time, arr_time, sched_dep_time, sched_arr_time, dep_delay,
# arr_delay, carrier, flight, tailnum, origin, dest, air_time, distance, hour, minute, time_hour
# tWeather columns used:
# origin, year, month, day, hour, temp, dewp, humid, wind_dir, wind_speed,
# wind_gust, precip, pressure, visib, time_hour
# Join each flight to the weather observation for the same airport and hour.
# all = FALSE (the merge() default) keeps only rows whose key appears in both
# tables, i.e. an inner join. Columns that exist in both tables but are not
# keys receive .x (flights) and .y (weather) suffixes.
tFlightWeather <- merge(
  x = tFlights,
  y = tWeather,
  by = c("origin", "time_hour"),
  all = FALSE
)
# Inspect the structure of the merged result.
str(tFlightWeather)
## 'data.frame': 335220 obs. of 32 variables:
## $ origin : chr "EWR" "EWR" "EWR" "EWR" ...
## $ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
## $ year.x : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month.x : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day.x : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 517 554 732 629 624 643 644 643 601 559 ...
## $ sched_dep_time: int 515 558 645 630 630 646 636 645 600 600 ...
## $ dep_delay : num 2 -4 47 -1 -6 -3 8 -2 1 -1 ...
## $ arr_time : int 830 740 1011 824 909 922 931 837 844 854 ...
## $ sched_arr_time: int 819 728 941 833 840 940 940 848 850 902 ...
## $ arr_delay : num 11 12 30 -9 29 -18 -9 -11 -6 -8 ...
## $ carrier : chr "UA" "UA" "UA" "US" ...
## $ flight : int 1545 1696 1111 1019 4626 556 1701 926 343 1187 ...
## $ tailnum : chr "N14228" "N39463" "N37456" "N426US" ...
## $ dest : chr "IAH" "ORD" "MCO" "CLT" ...
## $ air_time : num 227 150 145 91 190 146 151 91 147 337 ...
## $ distance : num 1400 719 937 529 1008 ...
## $ hour.x : num 5 5 6 6 6 6 6 6 6 6 ...
## $ minute : num 15 58 45 30 30 46 36 45 0 0 ...
## $ year.y : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month.y : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day.y : int 1 1 1 1 1 1 1 1 1 1 ...
## $ hour.y : int 5 5 6 6 6 6 6 6 6 6 ...
## $ temp : num 39 39 37.9 37.9 37.9 ...
## $ dewp : num 28 28 28 28 28 ...
## $ humid : num 64.4 64.4 67.2 67.2 67.2 ...
## $ wind_dir : num 260 260 240 240 240 240 240 240 240 240 ...
## $ wind_speed : num 12.7 12.7 11.5 11.5 11.5 ...
## $ wind_gust : num NA NA NA NA NA NA NA NA NA NA ...
## $ precip : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pressure : num 1012 1012 1012 1012 1012 ...
## $ visib : num 10 10 10 10 10 10 10 10 10 10 ...
# Row counts of the two inputs and of the merged result.
nrow(tFlights)
## [1] 336776
nrow(tWeather)
## [1] 26115
nrow(tFlightWeather)
## [1] 335220
# Difference in row count between the flights table and the merged result.
nrow(tFlights) - nrow(tFlightWeather)
## [1] 1556
# Flights that have no weather record for their origin airport and hour.
# anti_join() returns the rows of tFlights without a key match in tWeather,
# which are the rows an inner join on the same keys leaves out.
missingFlights <- anti_join(tFlights, tWeather, by = c("origin", "time_hour"))
missingFlights
## # A tibble: 1,556 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1153 1200 -7 1450 1529
## 2 2013 1 1 1154 1200 -6 1253 1306
## 3 2013 1 1 1155 1200 -5 1517 1510
## 4 2013 1 1 1155 1200 -5 1312 1315
## 5 2013 1 1 1157 1200 -3 1452 1456
## 6 2013 1 1 1158 1200 -2 1256 1300
## 7 2013 1 1 1200 1200 0 1408 1356
## 8 2013 1 1 1202 1207 -5 1318 1314
## 9 2013 1 1 1203 1205 -2 1501 1437
## 10 2013 1 1 1203 1200 3 1519 1545
## # ℹ 1,546 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# The merge kept only rows where both tFlights and tWeather share the same origin and time_hour. All others were dropped.
# No automatic factor conversion occurs in R >= 4.0.
# Print the running R version to back up the note above about factor conversion.
getRversion()
## [1] '4.5.2'
# 1. Columns with .x and .y: year, month, day, hour
# 2. year, month, and day are redundant and one version can be dropped. hour differs in data type between the two tibbles.
# Check whether each suffixed column pair carries exactly the same values.
# identical() also compares storage type, not just the numbers.
identical(tFlightWeather$year.x, tFlightWeather$year.y)
## [1] TRUE
identical(tFlightWeather$month.x, tFlightWeather$month.y)
## [1] TRUE
identical(tFlightWeather$day.x, tFlightWeather$day.y)
## [1] TRUE
# hour.x is stored as num and hour.y as int (see the str() output of the merged
# table), so this comparison is FALSE on type alone.
identical(tFlightWeather$hour.x, tFlightWeather$hour.y)
## [1] FALSE
# 3. origin and time_hour were used as merge keys, so R keeps only one version of each automatically.
# Drop the weather-side duplicates of the date/hour columns, then strip the
# .x suffix from the flight-side versions so they regain their plain names.
dfMyFlightsWeather <- tFlightWeather %>%
  select(-c(year.y, month.y, day.y, hour.y)) %>%
  rename(year = year.x, month = month.x, day = day.x, hour = hour.x)
# Confirm the cleaned column set.
str(dfMyFlightsWeather)
## 'data.frame': 335220 obs. of 28 variables:
## $ origin : chr "EWR" "EWR" "EWR" "EWR" ...
## $ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
## $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int 517 554 732 629 624 643 644 643 601 559 ...
## $ sched_dep_time: int 515 558 645 630 630 646 636 645 600 600 ...
## $ dep_delay : num 2 -4 47 -1 -6 -3 8 -2 1 -1 ...
## $ arr_time : int 830 740 1011 824 909 922 931 837 844 854 ...
## $ sched_arr_time: int 819 728 941 833 840 940 940 848 850 902 ...
## $ arr_delay : num 11 12 30 -9 29 -18 -9 -11 -6 -8 ...
## $ carrier : chr "UA" "UA" "UA" "US" ...
## $ flight : int 1545 1696 1111 1019 4626 556 1701 926 343 1187 ...
## $ tailnum : chr "N14228" "N39463" "N37456" "N426US" ...
## $ dest : chr "IAH" "ORD" "MCO" "CLT" ...
## $ air_time : num 227 150 145 91 190 146 151 91 147 337 ...
## $ distance : num 1400 719 937 529 1008 ...
## $ hour : num 5 5 6 6 6 6 6 6 6 6 ...
## $ minute : num 15 58 45 30 30 46 36 45 0 0 ...
## $ temp : num 39 39 37.9 37.9 37.9 ...
## $ dewp : num 28 28 28 28 28 ...
## $ humid : num 64.4 64.4 67.2 67.2 67.2 ...
## $ wind_dir : num 260 260 240 240 240 240 240 240 240 240 ...
## $ wind_speed : num 12.7 12.7 11.5 11.5 11.5 ...
## $ wind_gust : num NA NA NA NA NA NA NA NA NA NA ...
## $ precip : num 0 0 0 0 0 0 0 0 0 0 ...
## $ pressure : num 1012 1012 1012 1012 1012 ...
## $ visib : num 10 10 10 10 10 10 10 10 10 10 ...
# Histogram of departure delay across all merged flights.
# Bins are left at the geom_histogram() default; rows with a missing
# dep_delay are dropped by ggplot2 with a warning.
pMyFlightsWeather <- ggplot(dfMyFlightsWeather, aes(x = dep_delay)) +
  geom_histogram()
pMyFlightsWeather
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 8227 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Quartiles, mean and NA count of departure delay in the merged data.
summary(dfMyFlightsWeather$dep_delay)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -43.00 -5.00 -2.00 12.64 11.00 1301.00 8227
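# The mean departure delay was 12.64 minutes.
# The median departure delay was -2 minutes (2 minutes early), so the mean is pulled up by a long right tail (maximum 1301 minutes).
# 25% of flights departed at least 5 minutes early.
# 75% of flights were delayed no more than 11 minutes.
# 25% of flights were delayed 11 minutes or more.
# These figures exclude the 8227 flights whose dep_delay is NA.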
# Keep only the rows whose departure delay is missing; this analysis treats
# those rows as canceled flights.
dfCanceled <- filter(dfMyFlightsWeather, is.na(dep_delay))
# Every dep_delay value in dfCanceled is NA, so this histogram has nothing
# to draw.
pCanceled <- ggplot(data = dfCanceled, mapping = aes(x = dep_delay)) +
  geom_histogram()
pCanceled
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 8227 rows containing non-finite outside the scale range
## (`stat_bin()`).
# The histogram is empty because all canceled flights have NA in dep_delay. R has no numeric values to plot.
# Threshold, in minutes of departure delay, for the heavily delayed subset.
delayCutoff <- 100
# Flights with a recorded departure delay strictly above the threshold.
dfDelayedCutoff <- filter(
  dfMyFlightsWeather,
  !is.na(dep_delay),
  dep_delay > delayCutoff
)
# Distribution of departure delay within that subset.
pDelayedCutoff <- ggplot(data = dfDelayedCutoff, mapping = aes(x = dep_delay)) +
  geom_histogram()
pDelayedCutoff
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
# Quartiles and mean of departure delay within the heavily delayed subset.
summary(dfDelayedCutoff$dep_delay)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 101.0 119.0 145.0 166.1 190.0 1301.0
# Departure delay against temperature for the heavily delayed flights,
# with a fitted straight line (method = "lm") overlaid on the points.
pTemperature <- ggplot(dfDelayedCutoff, aes(x = temp, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Temperature (F)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Temperature"
  )
pTemperature
## `geom_smooth()` using formula = 'y ~ x'
# Delays are spread across all temperatures. No strong trend; a slight upward slope may suggest warmer temps have marginally higher delays but the pattern is weak.
# Same delay-vs-temperature scatter, colored by origin airport. Mapping
# color to origin also groups the data, so a separate straight line is
# fitted per airport.
pTemperatureColor <- ggplot(
  dfDelayedCutoff,
  aes(x = temp, y = dep_delay, color = origin)
) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Temperature (F)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Temperature by Origin"
  )
pTemperatureColor
## `geom_smooth()` using formula = 'y ~ x'
# No strong trend overall. EWR, LGA, and JFK show similar scatter patterns with no major differences by airport.
# Departure delay against dew point for the heavily delayed flights,
# with a fitted straight line overlaid on the points.
pDewPoint <- ggplot(dfDelayedCutoff, aes(x = dewp, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Dew Point (F)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Dew Point"
  )
pDewPoint
## `geom_smooth()` using formula = 'y ~ x'
# Similar to temperature -- delays are scattered across dew points with no strong trend.
# Departure delay against relative humidity for the heavily delayed flights,
# with a fitted straight line overlaid on the points.
pHumidity <- ggplot(dfDelayedCutoff, aes(x = humid, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Relative Humidity (%)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Humidity"
  )
pHumidity
## `geom_smooth()` using formula = 'y ~ x'
# Delays happen at all humidity levels. No strong trend, though very high humidity shows more variation in delays.
# Departure delay against wind direction for the heavily delayed flights,
# with a fitted straight line overlaid on the points.
# NOTE(review): wind direction is recorded in degrees (0-360 in the weather
# summary), which is a circular scale; a straight-line fit across it may not
# be meaningful -- confirm before drawing conclusions from the slope.
pWindDir <- ggplot(dfDelayedCutoff, aes(x = wind_dir, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Wind Direction (degrees)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Wind Direction"
  )
pWindDir
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 358 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 358 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Delays are spread across all wind directions. Direction alone does not strongly affect delays.
# Departure delay against wind speed for the heavily delayed flights,
# with a fitted straight line overlaid on the points.
# NOTE(review): summary(tWeather) reports a wind_speed maximum of about
# 1048 mph, which looks like a bad reading; check whether it falls in this
# subset, since a single extreme x value would distort the fit.
pWindSpeed <- ggplot(dfDelayedCutoff, aes(x = wind_speed, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Wind Speed (mph)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Wind Speed"
  )
pWindSpeed
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
# Most delays happen at normal wind speeds under 20 mph. Extreme wind speeds are rare but can be associated with larger delays.
# Departure delay against wind gust for the heavily delayed flights,
# with a fitted straight line overlaid on the points.
# wind_gust is NA for most weather records, so ggplot2 drops many rows here.
pWindGust <- ggplot(dfDelayedCutoff, aes(x = wind_gust, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Wind Gust (mph)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Wind Gust"
  )
pWindGust
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 10050 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 10050 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Many flights have no recorded gusts. When gusts exist, they may link with higher delays but the pattern is not strong.
# Departure delay against precipitation for the heavily delayed flights,
# with a fitted straight line overlaid on the points.
pPrecip <- ggplot(dfDelayedCutoff, aes(x = precip, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Precipitation (inches)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Precipitation"
  )
pPrecip
## `geom_smooth()` using formula = 'y ~ x'
# Most flights occur with little or no precipitation. Higher precipitation shows a small increase in delays based on the upward trend line.
# Departure delay against sea level pressure for the heavily delayed flights,
# with a fitted straight line overlaid on the points.
# Rows with a missing pressure reading are dropped by ggplot2 with a warning.
pPressure <- ggplot(dfDelayedCutoff, aes(x = pressure, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Sea Level Pressure (mb)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Pressure"
  )
pPressure
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3025 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3025 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Delays happen at all pressure levels. Pressure does not appear to strongly affect delays.
# Departure delay against visibility for the heavily delayed flights,
# with a fitted straight line overlaid on the points.
pVisibility <- ggplot(dfDelayedCutoff, aes(x = visib, y = dep_delay)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(
    x = "Visibility (miles)",
    y = "Departure Delay (minutes)",
    title = "Departure Delay vs. Visibility"
  )
pVisibility
## `geom_smooth()` using formula = 'y ~ x'
# Most flights have full visibility (10 miles). Delays can be higher when visibility drops but the overall pattern is weak.
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
library(ggplot2)
# Histogram of a single column of a data frame.
#   data:      data frame holding the column to plot
#   column:    name of the column, as a character scalar
#   axisLabel: text for the x axis
# Returns the ggplot object. Bins are left at the geom_histogram() default.
weatherHistogram <- function(data, column, axisLabel) {
  # .data[[column]] looks the column up by its string name inside aes().
  ggplot(data, aes(x = .data[[column]])) +
    geom_histogram() +
    labs(x = axisLabel)
}

# One histogram per weather variable, over the canceled flights only.
pCTemp <- weatherHistogram(dfCanceled, "temp", "Temperature (F)")
pCDewp <- weatherHistogram(dfCanceled, "dewp", "Dew Point (F)")
pCHumid <- weatherHistogram(dfCanceled, "humid", "Humidity (%)")
pCWindDir <- weatherHistogram(dfCanceled, "wind_dir", "Wind Direction (degrees)")
pCWindSpeed <- weatherHistogram(dfCanceled, "wind_speed", "Wind Speed (mph)")
pCWindGust <- weatherHistogram(dfCanceled, "wind_gust", "Wind Gust (mph)")
pCPrecip <- weatherHistogram(dfCanceled, "precip", "Precipitation (inches)")
pCPressure <- weatherHistogram(dfCanceled, "pressure", "Pressure (mb)")
pCVisib <- weatherHistogram(dfCanceled, "visib", "Visibility (miles)")

# Arrange the nine histograms in a three-column grid.
plot_grid(
  pCTemp, pCDewp, pCHumid,
  pCWindDir, pCWindSpeed, pCWindGust,
  pCPrecip, pCPressure, pCVisib,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 167 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 5576 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2441 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
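# The analysis focused on non-canceled flights with departure delays over 100 minutes. No strong linear relationship was found between departure delay and any single weather variable.
# Because weather does not appear to be a strong predictor of delay length within this subset, other factors such as operational issues may be the more likely cause of these delays.
# This is a tentative conclusion: the subset excludes flights with shorter delays, so it cannot show whether weather affects the chance of a long delay occurring in the first place. The dataset is a good starting point for that further investigation.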