library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tibble)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.1.5
## ✔ ggplot2   4.0.0     ✔ stringr   1.5.2
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
search()
##  [1] ".GlobalEnv"           "package:nycflights13" "package:lubridate"   
##  [4] "package:forcats"      "package:stringr"      "package:purrr"       
##  [7] "package:readr"        "package:tidyr"        "package:ggplot2"     
## [10] "package:tidyverse"    "package:tibble"       "package:dplyr"       
## [13] "package:stats"        "package:graphics"     "package:grDevices"   
## [16] "package:utils"        "package:datasets"     "package:methods"     
## [19] "Autoloads"            "package:base"
ls("package:nycflights13")
## [1] "airlines" "airports" "flights"  "planes"   "weather"
ls()
## character(0)
# Local working copies of the two nycflights13 tables used in this analysis.
tWeather <- nycflights13::weather
tFlights <- nycflights13::flights
ls()
## [1] "tFlights" "tWeather"
head(tFlights, 10)
## # A tibble: 10 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
head(tWeather, 10)
## # A tibble: 10 × 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
##  2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
##  3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
##  4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
##  5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
##  6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
##  7 EWR     2013     1     1     7  39.0  28.0  64.4      240      15.0 
##  8 EWR     2013     1     1     8  39.9  28.0  62.2      250      10.4 
##  9 EWR     2013     1     1     9  39.9  28.0  62.2      260      15.0 
## 10 EWR     2013     1     1    10  41    28.0  59.6      260      13.8 
## # ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, time_hour <dttm>
str(tFlights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
summary(tFlights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 
str(tWeather)
## tibble [26,115 × 15] (S3: tbl_df/tbl/data.frame)
##  $ origin    : chr [1:26115] "EWR" "EWR" "EWR" "EWR" ...
##  $ year      : int [1:26115] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month     : int [1:26115] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day       : int [1:26115] 1 1 1 1 1 1 1 1 1 1 ...
##  $ hour      : int [1:26115] 1 2 3 4 5 6 7 8 9 10 ...
##  $ temp      : num [1:26115] 39 39 39 39.9 39 ...
##  $ dewp      : num [1:26115] 26.1 27 28 28 28 ...
##  $ humid     : num [1:26115] 59.4 61.6 64.4 62.2 64.4 ...
##  $ wind_dir  : num [1:26115] 270 250 240 250 260 240 240 250 260 260 ...
##  $ wind_speed: num [1:26115] 10.36 8.06 11.51 12.66 12.66 ...
##  $ wind_gust : num [1:26115] NA NA NA NA NA NA NA NA NA NA ...
##  $ precip    : num [1:26115] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pressure  : num [1:26115] 1012 1012 1012 1012 1012 ...
##  $ visib     : num [1:26115] 10 10 10 10 10 10 10 10 10 10 ...
##  $ time_hour : POSIXct[1:26115], format: "2013-01-01 01:00:00" "2013-01-01 02:00:00" ...
summary(tWeather)
##     origin               year          month             day       
##  Length:26115       Min.   :2013   Min.   : 1.000   Min.   : 1.00  
##  Class :character   1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00  
##  Mode  :character   Median :2013   Median : 7.000   Median :16.00  
##                     Mean   :2013   Mean   : 6.504   Mean   :15.68  
##                     3rd Qu.:2013   3rd Qu.: 9.000   3rd Qu.:23.00  
##                     Max.   :2013   Max.   :12.000   Max.   :31.00  
##                                                                    
##       hour            temp             dewp           humid       
##  Min.   : 0.00   Min.   : 10.94   Min.   :-9.94   Min.   : 12.74  
##  1st Qu.: 6.00   1st Qu.: 39.92   1st Qu.:26.06   1st Qu.: 47.05  
##  Median :11.00   Median : 55.40   Median :42.08   Median : 61.79  
##  Mean   :11.49   Mean   : 55.26   Mean   :41.44   Mean   : 62.53  
##  3rd Qu.:17.00   3rd Qu.: 69.98   3rd Qu.:57.92   3rd Qu.: 78.79  
##  Max.   :23.00   Max.   :100.04   Max.   :78.08   Max.   :100.00  
##                  NA's   :1        NA's   :1       NA's   :1       
##     wind_dir       wind_speed         wind_gust         precip        
##  Min.   :  0.0   Min.   :   0.000   Min.   :16.11   Min.   :0.000000  
##  1st Qu.:120.0   1st Qu.:   6.905   1st Qu.:20.71   1st Qu.:0.000000  
##  Median :220.0   Median :  10.357   Median :24.17   Median :0.000000  
##  Mean   :199.8   Mean   :  10.517   Mean   :25.49   Mean   :0.004469  
##  3rd Qu.:290.0   3rd Qu.:  13.809   3rd Qu.:28.77   3rd Qu.:0.000000  
##  Max.   :360.0   Max.   :1048.361   Max.   :66.75   Max.   :1.210000  
##  NA's   :460     NA's   :4          NA's   :20778                     
##     pressure          visib          time_hour                  
##  Min.   : 983.8   Min.   : 0.000   Min.   :2013-01-01 01:00:00  
##  1st Qu.:1012.9   1st Qu.:10.000   1st Qu.:2013-04-01 21:30:00  
##  Median :1017.6   Median :10.000   Median :2013-07-01 14:00:00  
##  Mean   :1017.9   Mean   : 9.255   Mean   :2013-07-01 18:26:37  
##  3rd Qu.:1023.0   3rd Qu.:10.000   3rd Qu.:2013-09-30 13:00:00  
##  Max.   :1042.1   Max.   :10.000   Max.   :2013-12-30 18:00:00  
##  NA's   :2729
# tFlights columns used:
# year, month, day, dep_time, arr_time, sched_dep_time, sched_arr_time, dep_delay,
# arr_delay, carrier, flight, tailnum, origin, dest, air_time, distance, hour, minute, time_hour

# tWeather columns used:
# origin, year, month, day, hour, temp, dewp, humid, wind_dir, wind_speed,
# wind_gust, precip, pressure, visib, time_hour
# Join flights to hourly weather on airport and hour with base merge().
# Only rows whose (origin, time_hour) pair exists in both tables are kept.
tFlightWeather <- merge(
  x = tFlights,
  y = tWeather,
  by = c("origin", "time_hour")
)
str(tFlightWeather)
## 'data.frame':    335220 obs. of  32 variables:
##  $ origin        : chr  "EWR" "EWR" "EWR" "EWR" ...
##  $ time_hour     : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
##  $ year.x        : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month.x       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day.x         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int  517 554 732 629 624 643 644 643 601 559 ...
##  $ sched_dep_time: int  515 558 645 630 630 646 636 645 600 600 ...
##  $ dep_delay     : num  2 -4 47 -1 -6 -3 8 -2 1 -1 ...
##  $ arr_time      : int  830 740 1011 824 909 922 931 837 844 854 ...
##  $ sched_arr_time: int  819 728 941 833 840 940 940 848 850 902 ...
##  $ arr_delay     : num  11 12 30 -9 29 -18 -9 -11 -6 -8 ...
##  $ carrier       : chr  "UA" "UA" "UA" "US" ...
##  $ flight        : int  1545 1696 1111 1019 4626 556 1701 926 343 1187 ...
##  $ tailnum       : chr  "N14228" "N39463" "N37456" "N426US" ...
##  $ dest          : chr  "IAH" "ORD" "MCO" "CLT" ...
##  $ air_time      : num  227 150 145 91 190 146 151 91 147 337 ...
##  $ distance      : num  1400 719 937 529 1008 ...
##  $ hour.x        : num  5 5 6 6 6 6 6 6 6 6 ...
##  $ minute        : num  15 58 45 30 30 46 36 45 0 0 ...
##  $ year.y        : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month.y       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day.y         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ hour.y        : int  5 5 6 6 6 6 6 6 6 6 ...
##  $ temp          : num  39 39 37.9 37.9 37.9 ...
##  $ dewp          : num  28 28 28 28 28 ...
##  $ humid         : num  64.4 64.4 67.2 67.2 67.2 ...
##  $ wind_dir      : num  260 260 240 240 240 240 240 240 240 240 ...
##  $ wind_speed    : num  12.7 12.7 11.5 11.5 11.5 ...
##  $ wind_gust     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ precip        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pressure      : num  1012 1012 1012 1012 1012 ...
##  $ visib         : num  10 10 10 10 10 10 10 10 10 10 ...
nrow(tFlights)
## [1] 336776
nrow(tWeather)
## [1] 26115
nrow(tFlightWeather)
## [1] 335220
nrow(tFlights) - nrow(tFlightWeather)
## [1] 1556
# Flights that have no weather row for the same origin and time_hour.
missingFlights <- tFlights %>%
  anti_join(tWeather, by = c("origin", "time_hour"))
missingFlights
## # A tibble: 1,556 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1     1153           1200        -7     1450           1529
##  2  2013     1     1     1154           1200        -6     1253           1306
##  3  2013     1     1     1155           1200        -5     1517           1510
##  4  2013     1     1     1155           1200        -5     1312           1315
##  5  2013     1     1     1157           1200        -3     1452           1456
##  6  2013     1     1     1158           1200        -2     1256           1300
##  7  2013     1     1     1200           1200         0     1408           1356
##  8  2013     1     1     1202           1207        -5     1318           1314
##  9  2013     1     1     1203           1205        -2     1501           1437
## 10  2013     1     1     1203           1200         3     1519           1545
## # ℹ 1,546 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# The merge kept only rows where both tFlights and tWeather share the same origin and time_hour. All others were dropped.
# No automatic factor conversion occurs in R >= 4.0.
getRversion()
## [1] '4.5.2'
# 1. Columns with .x and .y: year, month, day, hour

# 2. year, month, and day are redundant and one version can be dropped. hour differs in data type between the two tibbles.
# Compare each .x/.y column pair that the merge produced.
identical(tFlightWeather$year.x, tFlightWeather$year.y)
## [1] TRUE
identical(tFlightWeather$month.x, tFlightWeather$month.y)
## [1] TRUE
identical(tFlightWeather$day.x, tFlightWeather$day.y)
## [1] TRUE
# identical() also compares storage type: hour.x is num and hour.y is int in
# the str() output above, so FALSE here does not by itself mean the values differ.
identical(tFlightWeather$hour.x, tFlightWeather$hour.y)
## [1] FALSE
# 3. origin and time_hour were used as merge keys, so R keeps only one version of each automatically.
# Drop the weather-side duplicates (*.y), then give the flight-side columns
# (*.x) their plain names back, in a single pipeline.
dfMyFlightsWeather <- tFlightWeather %>%
  select(-c(year.y, month.y, day.y, hour.y)) %>%
  rename(year = year.x, month = month.x, day = day.x, hour = hour.x)

str(dfMyFlightsWeather)
## 'data.frame':    335220 obs. of  28 variables:
##  $ origin        : chr  "EWR" "EWR" "EWR" "EWR" ...
##  $ time_hour     : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
##  $ year          : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int  517 554 732 629 624 643 644 643 601 559 ...
##  $ sched_dep_time: int  515 558 645 630 630 646 636 645 600 600 ...
##  $ dep_delay     : num  2 -4 47 -1 -6 -3 8 -2 1 -1 ...
##  $ arr_time      : int  830 740 1011 824 909 922 931 837 844 854 ...
##  $ sched_arr_time: int  819 728 941 833 840 940 940 848 850 902 ...
##  $ arr_delay     : num  11 12 30 -9 29 -18 -9 -11 -6 -8 ...
##  $ carrier       : chr  "UA" "UA" "UA" "US" ...
##  $ flight        : int  1545 1696 1111 1019 4626 556 1701 926 343 1187 ...
##  $ tailnum       : chr  "N14228" "N39463" "N37456" "N426US" ...
##  $ dest          : chr  "IAH" "ORD" "MCO" "CLT" ...
##  $ air_time      : num  227 150 145 91 190 146 151 91 147 337 ...
##  $ distance      : num  1400 719 937 529 1008 ...
##  $ hour          : num  5 5 6 6 6 6 6 6 6 6 ...
##  $ minute        : num  15 58 45 30 30 46 36 45 0 0 ...
##  $ temp          : num  39 39 37.9 37.9 37.9 ...
##  $ dewp          : num  28 28 28 28 28 ...
##  $ humid         : num  64.4 64.4 67.2 67.2 67.2 ...
##  $ wind_dir      : num  260 260 240 240 240 240 240 240 240 240 ...
##  $ wind_speed    : num  12.7 12.7 11.5 11.5 11.5 ...
##  $ wind_gust     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ precip        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ pressure      : num  1012 1012 1012 1012 1012 ...
##  $ visib         : num  10 10 10 10 10 10 10 10 10 10 ...
# Histogram of departure delay across all merged flights.
pMyFlightsWeather <- dfMyFlightsWeather %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
pMyFlightsWeather
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 8227 rows containing non-finite outside the scale range
## (`stat_bin()`).

summary(dfMyFlightsWeather$dep_delay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -43.00   -5.00   -2.00   12.64   11.00 1301.00    8227
# The average departure delay was 12.64 minutes
# The median departure delay was -2 minutes (2 minutes early)
# 25% of flights departed at least 5 minutes early
# 75% of flights were delayed no more than 11 minutes
# About 25% of flights were delayed 11 minutes or more
# (These figures exclude the 8,227 flights with no recorded departure delay.)
# Rows with no recorded departure delay; this analysis treats them as
# canceled flights.
dfCanceled <- filter(dfMyFlightsWeather, is.na(dep_delay))

# dep_delay is NA on every row kept above, so this histogram has nothing to bin.
pCanceled <- dfCanceled %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
pCanceled
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 8227 rows containing non-finite outside the scale range
## (`stat_bin()`).

# The histogram is empty because all canceled flights have NA in dep_delay. R has no numeric values to plot.
# Threshold on dep_delay; only flights delayed by more than this are kept.
delayCutoff <- 100

# Keep flights with a recorded departure delay above the cutoff.
dfDelayedCutoff <- filter(
  dfMyFlightsWeather,
  !is.na(dep_delay),
  dep_delay > delayCutoff
)

pDelayedCutoff <- dfDelayedCutoff %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
pDelayedCutoff
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

summary(dfDelayedCutoff$dep_delay)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   101.0   119.0   145.0   166.1   190.0  1301.0
# Scatter plot of departure delay against one column of `df`, with a
# linear-model trend line from geom_smooth(method = "lm").
#
#   df         data frame containing `dep_delay` and the column named by `x_var`
#   x_var      name of the column mapped to the x axis, as a string
#   x_label    x-axis label
#   title      plot title
#   color_var  optional name of a column mapped to color (NULL for none)
#
# Returns the ggplot object; nothing is drawn until the object is printed.
plotDelayVs <- function(df, x_var, x_label, title, color_var = NULL) {
  p <- ggplot(df, aes(x = .data[[x_var]], y = dep_delay))
  if (!is.null(color_var)) {
    # Added to the plot-level mapping so geom_smooth() fits one line per group;
    # the legend title is set explicitly to the column name.
    p <- p + aes(color = .data[[color_var]]) + labs(color = color_var)
  }
  p +
    geom_point() +
    geom_smooth(method = "lm") +
    labs(x = x_label, y = "Departure Delay (minutes)", title = title)
}

pTemperature <- plotDelayVs(
  dfDelayedCutoff, "temp",
  x_label = "Temperature (F)",
  title = "Departure Delay vs. Temperature"
)
pTemperature
## `geom_smooth()` using formula = 'y ~ x'

# Delays are spread across all temperatures. No strong trend; a slight upward slope may suggest warmer temps have marginally higher delays but the pattern is weak.
pTemperatureColor <- plotDelayVs(
  dfDelayedCutoff, "temp",
  x_label = "Temperature (F)",
  title = "Departure Delay vs. Temperature by Origin",
  color_var = "origin"
)
pTemperatureColor
## `geom_smooth()` using formula = 'y ~ x'

# No strong trend overall. EWR, LGA, and JFK show similar scatter patterns with no major differences by airport.
pDewPoint <- plotDelayVs(
  dfDelayedCutoff, "dewp",
  x_label = "Dew Point (F)",
  title = "Departure Delay vs. Dew Point"
)
pDewPoint
## `geom_smooth()` using formula = 'y ~ x'

# Similar to temperature -- delays are scattered across dew points with no strong trend.
pHumidity <- plotDelayVs(
  dfDelayedCutoff, "humid",
  x_label = "Relative Humidity (%)",
  title = "Departure Delay vs. Humidity"
)
pHumidity
## `geom_smooth()` using formula = 'y ~ x'

# Delays happen at all humidity levels. No strong trend, though very high humidity shows more variation in delays.
pWindDir <- plotDelayVs(
  dfDelayedCutoff, "wind_dir",
  x_label = "Wind Direction (degrees)",
  title = "Departure Delay vs. Wind Direction"
)
pWindDir
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 358 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 358 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Delays are spread across all wind directions. Direction alone does not strongly affect delays.
pWindSpeed <- plotDelayVs(
  dfDelayedCutoff, "wind_speed",
  x_label = "Wind Speed (mph)",
  title = "Departure Delay vs. Wind Speed"
)
pWindSpeed
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

# Most delays happen at normal wind speeds under 20 mph. Extreme wind speeds are rare but can be associated with larger delays.
pWindGust <- plotDelayVs(
  dfDelayedCutoff, "wind_gust",
  x_label = "Wind Gust (mph)",
  title = "Departure Delay vs. Wind Gust"
)
pWindGust
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 10050 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 10050 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Many flights have no recorded gusts. When gusts exist, they may link with higher delays but the pattern is not strong.

pPrecip <- plotDelayVs(
  dfDelayedCutoff, "precip",
  x_label = "Precipitation (inches)",
  title = "Departure Delay vs. Precipitation"
)
pPrecip
## `geom_smooth()` using formula = 'y ~ x'

# Most flights occur with little or no precipitation. Higher precipitation shows a small increase in delays based on the upward trend line.
pPressure <- plotDelayVs(
  dfDelayedCutoff, "pressure",
  x_label = "Sea Level Pressure (mb)",
  title = "Departure Delay vs. Pressure"
)
pPressure
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3025 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3025 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Delays happen at all pressure levels. Pressure does not appear to strongly affect delays.
pVisibility <- plotDelayVs(
  dfDelayedCutoff, "visib",
  x_label = "Visibility (miles)",
  title = "Departure Delay vs. Visibility"
)
pVisibility
## `geom_smooth()` using formula = 'y ~ x'

# Most flights have full visibility (10 miles). Delays can be higher when visibility drops but the overall pattern is weak.
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(ggplot2)
# Histogram (default binning) of one column of `df`.
#
#   df       data frame containing the column named by `x_var`
#   x_var    name of the column to plot, as a string
#   x_label  x-axis label
#
# Returns the ggplot object; nothing is drawn until it is printed or placed
# in a grid.
canceledHistogram <- function(df, x_var, x_label) {
  ggplot(df, aes(x = .data[[x_var]])) +
    geom_histogram() +
    labs(x = x_label)
}

# One histogram per weather variable for the canceled flights.
pCTemp <- canceledHistogram(dfCanceled, "temp", "Temperature (F)")
pCDewp <- canceledHistogram(dfCanceled, "dewp", "Dew Point (F)")
pCHumid <- canceledHistogram(dfCanceled, "humid", "Humidity (%)")
pCWindDir <- canceledHistogram(
  dfCanceled, "wind_dir", "Wind Direction (degrees)"
)
pCWindSpeed <- canceledHistogram(dfCanceled, "wind_speed", "Wind Speed (mph)")
pCWindGust <- canceledHistogram(dfCanceled, "wind_gust", "Wind Gust (mph)")
pCPrecip <- canceledHistogram(dfCanceled, "precip", "Precipitation (inches)")
pCPressure <- canceledHistogram(dfCanceled, "pressure", "Pressure (mb)")
pCVisib <- canceledHistogram(dfCanceled, "visib", "Visibility (miles)")

# Arrange the nine histograms in a 3-column grid, in the order listed.
plot_grid(
  pCTemp, pCDewp, pCHumid,
  pCWindDir, pCWindSpeed, pCWindGust,
  pCPrecip, pCPressure, pCVisib,
  ncol = 3
)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 167 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 5576 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2441 rows containing non-finite outside the scale range
## (`stat_bin()`).
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

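# The analysis focused on non-canceled flights with delays over 100 minutes. Within that subset, no strong linear relationship was found between departure delay and any single weather variable.
# Weather therefore does not appear to be a strong predictor of how long these long delays are, which suggests that other factors, such as operational issues, may play a larger role. This analysis does not test that directly, and restricting to delays over 100 minutes cannot show whether weather affects the chance of a long delay in the first place.
# The dataset is a good starting point for further investigation.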