# Reading in the data and taking a glimpse of data before manipulation. Summary shows categories with heavy NAs
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
before2009 = read_csv("PricesBefore2009.csv")
## New names:
## Rows: 1933 Columns: 82
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf... dbl
## (39): ...1, Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCo...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
str("PricesBefore2009")
##  chr "PricesBefore2009"
head(before2009)
## # A tibble: 6 × 82
##    ...1    Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
##   <dbl> <dbl>      <dbl> <chr>          <dbl>   <dbl> <chr>  <chr> <chr>   
## 1     1     1         60 RL                65    8450 Pave   <NA>  Reg     
## 2     2     2         20 RL                80    9600 Pave   <NA>  Reg     
## 3     3     3         60 RL                68   11250 Pave   <NA>  IR1     
## 4     4     4         70 RL                60    9550 Pave   <NA>  IR1     
## 5     5     5         60 RL                84   14260 Pave   <NA>  IR1     
## 6     6     7         20 RL                75   10084 Pave   <NA>  Reg     
## # ℹ 73 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## #   LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## #   BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## #   YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## #   Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## #   ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## #   BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …
summary(before2009)
##       ...1            Id         MSSubClass       MSZoning        
##  Min.   :   1   Min.   :   1   Min.   : 20.00   Length:1933       
##  1st Qu.: 484   1st Qu.: 733   1st Qu.: 20.00   Class :character  
##  Median : 967   Median :1953   Median : 50.00   Mode  :character  
##  Mean   : 967   Mean   :1594   Mean   : 57.81                     
##  3rd Qu.:1450   3rd Qu.:2436   3rd Qu.: 70.00                     
##  Max.   :1933   Max.   :2919   Max.   :190.00                     
##                                                                   
##   LotFrontage        LotArea          Street             Alley          
##  Min.   : 21.00   Min.   :  1470   Length:1933        Length:1933       
##  1st Qu.: 58.00   1st Qu.:  7446   Class :character   Class :character  
##  Median : 68.00   Median :  9462   Mode  :character   Mode  :character  
##  Mean   : 69.38   Mean   : 10306                                        
##  3rd Qu.: 80.00   3rd Qu.: 11643                                        
##  Max.   :313.00   Max.   :164660                                        
##  NA's   :317                                                            
##    LotShape         LandContour         Utilities          LotConfig        
##  Length:1933        Length:1933        Length:1933        Length:1933       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   LandSlope         Neighborhood        Condition1         Condition2       
##  Length:1933        Length:1933        Length:1933        Length:1933       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BldgType          HouseStyle         OverallQual      OverallCond  
##  Length:1933        Length:1933        Min.   : 1.000   Min.   :1.00  
##  Class :character   Class :character   1st Qu.: 5.000   1st Qu.:5.00  
##  Mode  :character   Mode  :character   Median : 6.000   Median :5.00  
##                                        Mean   : 6.108   Mean   :5.57  
##                                        3rd Qu.: 7.000   3rd Qu.:6.00  
##                                        Max.   :10.000   Max.   :9.00  
##                                                                       
##    YearBuilt     YearRemodAdd   RoofStyle           RoofMatl        
##  Min.   :1872   Min.   :1950   Length:1933        Length:1933       
##  1st Qu.:1953   1st Qu.:1965   Class :character   Class :character  
##  Median :1972   Median :1993   Mode  :character   Mode  :character  
##  Mean   :1971   Mean   :1984                                        
##  3rd Qu.:2001   3rd Qu.:2004                                        
##  Max.   :2008   Max.   :2009                                        
##                                                                     
##  Exterior1st        Exterior2nd         MasVnrType          MasVnrArea    
##  Length:1933        Length:1933        Length:1933        Min.   :   0.0  
##  Class :character   Class :character   Class :character   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Mode  :character   Median :   0.0  
##                                                           Mean   : 105.2  
##                                                           3rd Qu.: 168.0  
##                                                           Max.   :1600.0  
##                                                           NA's   :18      
##   ExterQual          ExterCond          Foundation          BsmtQual        
##  Length:1933        Length:1933        Length:1933        Length:1933       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    BsmtCond         BsmtExposure       BsmtFinType1         BsmtFinSF1    
##  Length:1933        Length:1933        Length:1933        Min.   :   0.0  
##  Class :character   Class :character   Class :character   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Mode  :character   Median : 360.5  
##                                                           Mean   : 435.9  
##                                                           3rd Qu.: 733.0  
##                                                           Max.   :5644.0  
##                                                           NA's   :1       
##  BsmtFinType2         BsmtFinSF2        BsmtUnfSF       TotalBsmtSF    
##  Length:1933        Min.   :   0.00   Min.   :   0.0   Min.   :   0.0  
##  Class :character   1st Qu.:   0.00   1st Qu.: 226.0   1st Qu.: 796.8  
##  Mode  :character   Median :   0.00   Median : 476.5   Median : 988.0  
##                     Mean   :  48.96   Mean   : 569.1   Mean   :1053.9  
##                     3rd Qu.:   0.00   3rd Qu.: 816.0   3rd Qu.:1296.0  
##                     Max.   :1474.00   Max.   :2153.0   Max.   :6110.0  
##                     NA's   :1         NA's   :1        NA's   :1       
##    Heating           HeatingQC          CentralAir         Electrical       
##  Length:1933        Length:1933        Length:1933        Length:1933       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    X1stFlrSF      X2ndFlrSF     LowQualFinSF     GrLivArea     BsmtFullBath   
##  Min.   : 334   Min.   :   0   Min.   :  0.0   Min.   : 334   Min.   :0.0000  
##  1st Qu.: 886   1st Qu.:   0   1st Qu.:  0.0   1st Qu.:1118   1st Qu.:0.0000  
##  Median :1084   Median :   0   Median :  0.0   Median :1440   Median :0.0000  
##  Mean   :1161   Mean   : 342   Mean   :  4.3   Mean   :1507   Mean   :0.4138  
##  3rd Qu.:1383   3rd Qu.: 720   3rd Qu.:  0.0   3rd Qu.:1760   3rd Qu.:1.0000  
##  Max.   :5095   Max.   :2065   Max.   :697.0   Max.   :5642   Max.   :2.0000  
##                                                               NA's   :2       
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.06525   Mean   :1.568   Mean   :0.3787   Mean   :2.872  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##  NA's   :2                                                         
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1933        Min.   : 2.000   Length:1933       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.039                      Mean   : 6.474                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :2.000                      Max.   :15.000                     
##                                                                        
##    Fireplaces     FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.0000   Length:1933        Length:1933        Min.   :1895  
##  1st Qu.:0.0000   Class :character   Class :character   1st Qu.:1960  
##  Median :1.0000   Mode  :character   Mode  :character   Median :1979  
##  Mean   :0.5934                                         Mean   :1978  
##  3rd Qu.:1.0000                                         3rd Qu.:2002  
##  Max.   :4.0000                                         Max.   :2207  
##                                                         NA's   :107   
##  GarageFinish         GarageCars     GarageArea      GarageQual       
##  Length:1933        Min.   :0.00   Min.   :   0.0   Length:1933       
##  Class :character   1st Qu.:1.00   1st Qu.: 318.8   Class :character  
##  Mode  :character   Median :2.00   Median : 478.0   Mode  :character  
##                     Mean   :1.77   Mean   : 472.9                     
##                     3rd Qu.:2.00   3rd Qu.: 576.0                     
##                     Max.   :4.00   Max.   :1488.0                     
##                     NA's   :1      NA's   :1                          
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1933        Length:1933        Min.   :   0.0   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:   0.0   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :   0.0   Median : 28.00  
##                                        Mean   :  92.5   Mean   : 49.12  
##                                        3rd Qu.: 168.0   3rd Qu.: 72.00  
##                                        Max.   :1424.0   Max.   :742.00  
##                                                                         
##  EnclosedPorch       X3SsnPorch       ScreenPorch        PoolArea    
##  Min.   :   0.00   Min.   :  0.000   Min.   :  0.00   Min.   :  0.0  
##  1st Qu.:   0.00   1st Qu.:  0.000   1st Qu.:  0.00   1st Qu.:  0.0  
##  Median :   0.00   Median :  0.000   Median :  0.00   Median :  0.0  
##  Mean   :  23.04   Mean   :  2.259   Mean   : 16.32   Mean   :  3.4  
##  3rd Qu.:   0.00   3rd Qu.:  0.000   3rd Qu.:  0.00   3rd Qu.:  0.0  
##  Max.   :1012.00   Max.   :407.000   Max.   :576.00   Max.   :800.0  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1933        Length:1933        Length:1933        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   52.77  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :17000.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1933        Length:1933       
##  1st Qu.: 5.000   1st Qu.:2006   Class :character   Class :character  
##  Median : 6.000   Median :2007   Mode  :character   Mode  :character  
##  Mean   : 6.426   Mean   :2007                                        
##  3rd Qu.: 8.000   3rd Qu.:2008                                        
##  Max.   :12.000   Max.   :2008                                        
##                                                                       
##    SalePrice       
##  Min.   :   454.5  
##  1st Qu.:129000.0  
##  Median :163611.2  
##  Mean   :181127.7  
##  3rd Qu.:214993.0  
##  Max.   :755000.0  
##  NA's   :23
# Convert MSSubClass, OverallQual, and OverallCond to factors since they are currently categories and not numerical order
before2009$MSSubClass = as.factor(before2009$MSSubClass)
before2009$OverallQual = as.factor(before2009$OverallQual)
before2009$OverallCond = as.factor(before2009$OverallCond)

# OverallQual and OverallCond are ordinal (rating scales)
before2009$OverallQual <- ordered(before2009$OverallQual)
before2009$OverallCond <- ordered(before2009$OverallCond)

# Check structure to verify columns are now factors
str(before2009)
## spc_tbl_ [1,933 × 82] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ...1         : num [1:1933] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Id           : num [1:1933] 1 2 3 4 5 7 9 10 11 12 ...
##  $ MSSubClass   : Factor w/ 16 levels "20","30","40",..: 6 1 6 7 6 1 5 16 1 6 ...
##  $ MSZoning     : chr [1:1933] "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : num [1:1933] 65 80 68 60 84 75 51 50 70 85 ...
##  $ LotArea      : num [1:1933] 8450 9600 11250 9550 14260 ...
##  $ Street       : chr [1:1933] "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr [1:1933] NA NA NA NA ...
##  $ LotShape     : chr [1:1933] "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr [1:1933] "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr [1:1933] "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr [1:1933] "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr [1:1933] "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr [1:1933] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr [1:1933] "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr [1:1933] "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr [1:1933] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr [1:1933] "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 7 6 7 7 8 8 7 5 5 9 ...
##  $ OverallCond  : Ord.factor w/ 9 levels "1"<"2"<"3"<"4"<..: 5 8 5 5 5 5 5 6 5 5 ...
##  $ YearBuilt    : num [1:1933] 2003 1976 2001 1915 2000 ...
##  $ YearRemodAdd : num [1:1933] 2003 1976 2002 1970 2000 ...
##  $ RoofStyle    : chr [1:1933] "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr [1:1933] "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr [1:1933] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr [1:1933] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr [1:1933] "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : num [1:1933] 196 0 162 0 350 186 0 0 0 286 ...
##  $ ExterQual    : chr [1:1933] "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr [1:1933] "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr [1:1933] "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr [1:1933] "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr [1:1933] "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr [1:1933] "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr [1:1933] "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : num [1:1933] 706 978 486 216 655 ...
##  $ BsmtFinType2 : chr [1:1933] "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
##  $ BsmtUnfSF    : num [1:1933] 150 284 434 540 490 317 952 140 134 177 ...
##  $ TotalBsmtSF  : num [1:1933] 856 1262 920 756 1145 ...
##  $ Heating      : chr [1:1933] "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr [1:1933] "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr [1:1933] "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr [1:1933] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : num [1:1933] 856 1262 920 961 1145 ...
##  $ X2ndFlrSF    : num [1:1933] 854 0 866 756 1053 ...
##  $ LowQualFinSF : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : num [1:1933] 1710 1262 1786 1717 2198 ...
##  $ BsmtFullBath : num [1:1933] 1 0 1 1 1 1 0 1 1 1 ...
##  $ BsmtHalfBath : num [1:1933] 0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : num [1:1933] 2 2 2 1 2 2 2 1 1 3 ...
##  $ HalfBath     : num [1:1933] 1 0 1 0 1 0 0 0 0 0 ...
##  $ BedroomAbvGr : num [1:1933] 3 3 3 3 4 3 2 2 3 4 ...
##  $ KitchenAbvGr : num [1:1933] 1 1 1 1 1 1 2 2 1 1 ...
##  $ KitchenQual  : chr [1:1933] "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : num [1:1933] 8 6 6 7 9 7 8 5 5 11 ...
##  $ Functional   : chr [1:1933] "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : num [1:1933] 0 1 1 1 1 1 2 2 0 2 ...
##  $ FireplaceQu  : chr [1:1933] NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr [1:1933] "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : num [1:1933] 2003 1976 2001 1998 2000 ...
##  $ GarageFinish : chr [1:1933] "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : num [1:1933] 2 2 2 3 3 2 2 1 1 3 ...
##  $ GarageArea   : num [1:1933] 548 460 608 642 836 636 468 205 384 736 ...
##  $ GarageQual   : chr [1:1933] "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr [1:1933] "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr [1:1933] "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : num [1:1933] 0 298 0 0 192 255 90 0 0 147 ...
##  $ OpenPorchSF  : num [1:1933] 61 0 42 35 84 57 0 4 0 21 ...
##  $ EnclosedPorch: num [1:1933] 0 0 0 272 0 0 205 0 0 0 ...
##  $ X3SsnPorch   : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
##  $ ScreenPorch  : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr [1:1933] NA NA NA NA ...
##  $ Fence        : chr [1:1933] NA NA NA NA ...
##  $ MiscFeature  : chr [1:1933] NA NA NA NA ...
##  $ MiscVal      : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
##  $ MoSold       : num [1:1933] 2 5 9 2 12 8 4 1 2 7 ...
##  $ YrSold       : num [1:1933] 2008 2007 2008 2006 2008 ...
##  $ SaleType     : chr [1:1933] "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr [1:1933] "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : num [1:1933] 208500 181500 223500 140000 250000 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ...1 = col_double(),
##   ..   Id = col_double(),
##   ..   MSSubClass = col_double(),
##   ..   MSZoning = col_character(),
##   ..   LotFrontage = col_double(),
##   ..   LotArea = col_double(),
##   ..   Street = col_character(),
##   ..   Alley = col_character(),
##   ..   LotShape = col_character(),
##   ..   LandContour = col_character(),
##   ..   Utilities = col_character(),
##   ..   LotConfig = col_character(),
##   ..   LandSlope = col_character(),
##   ..   Neighborhood = col_character(),
##   ..   Condition1 = col_character(),
##   ..   Condition2 = col_character(),
##   ..   BldgType = col_character(),
##   ..   HouseStyle = col_character(),
##   ..   OverallQual = col_double(),
##   ..   OverallCond = col_double(),
##   ..   YearBuilt = col_double(),
##   ..   YearRemodAdd = col_double(),
##   ..   RoofStyle = col_character(),
##   ..   RoofMatl = col_character(),
##   ..   Exterior1st = col_character(),
##   ..   Exterior2nd = col_character(),
##   ..   MasVnrType = col_character(),
##   ..   MasVnrArea = col_double(),
##   ..   ExterQual = col_character(),
##   ..   ExterCond = col_character(),
##   ..   Foundation = col_character(),
##   ..   BsmtQual = col_character(),
##   ..   BsmtCond = col_character(),
##   ..   BsmtExposure = col_character(),
##   ..   BsmtFinType1 = col_character(),
##   ..   BsmtFinSF1 = col_double(),
##   ..   BsmtFinType2 = col_character(),
##   ..   BsmtFinSF2 = col_double(),
##   ..   BsmtUnfSF = col_double(),
##   ..   TotalBsmtSF = col_double(),
##   ..   Heating = col_character(),
##   ..   HeatingQC = col_character(),
##   ..   CentralAir = col_character(),
##   ..   Electrical = col_character(),
##   ..   X1stFlrSF = col_double(),
##   ..   X2ndFlrSF = col_double(),
##   ..   LowQualFinSF = col_double(),
##   ..   GrLivArea = col_double(),
##   ..   BsmtFullBath = col_double(),
##   ..   BsmtHalfBath = col_double(),
##   ..   FullBath = col_double(),
##   ..   HalfBath = col_double(),
##   ..   BedroomAbvGr = col_double(),
##   ..   KitchenAbvGr = col_double(),
##   ..   KitchenQual = col_character(),
##   ..   TotRmsAbvGrd = col_double(),
##   ..   Functional = col_character(),
##   ..   Fireplaces = col_double(),
##   ..   FireplaceQu = col_character(),
##   ..   GarageType = col_character(),
##   ..   GarageYrBlt = col_double(),
##   ..   GarageFinish = col_character(),
##   ..   GarageCars = col_double(),
##   ..   GarageArea = col_double(),
##   ..   GarageQual = col_character(),
##   ..   GarageCond = col_character(),
##   ..   PavedDrive = col_character(),
##   ..   WoodDeckSF = col_double(),
##   ..   OpenPorchSF = col_double(),
##   ..   EnclosedPorch = col_double(),
##   ..   X3SsnPorch = col_double(),
##   ..   ScreenPorch = col_double(),
##   ..   PoolArea = col_double(),
##   ..   PoolQC = col_character(),
##   ..   Fence = col_character(),
##   ..   MiscFeature = col_character(),
##   ..   MiscVal = col_double(),
##   ..   MoSold = col_double(),
##   ..   YrSold = col_double(),
##   ..   SaleType = col_character(),
##   ..   SaleCondition = col_character(),
##   ..   SalePrice = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Compute number of NAs in each column
na_counts = colSums(is.na(before2009))

# Get column names
col_names = colnames(before2009)

# Combine two vectors side by side and NA counts into a dataframe called beforeNAs
beforeNAs = data.frame(Data_Fields = col_names, NAs = na_counts)

# View 10 rows
head(beforeNAs, 10)
##             Data_Fields  NAs
## ...1               ...1    0
## Id                   Id    0
## MSSubClass   MSSubClass    0
## MSZoning       MSZoning    3
## LotFrontage LotFrontage  317
## LotArea         LotArea    0
## Street           Street    0
## Alley             Alley 1797
## LotShape       LotShape    0
## LandContour LandContour    0
# Sort the NA counts in descending order to see which columns have the most missing data
beforeNAs = beforeNAs[order(-beforeNAs$NAs), ]

# Display the top 10 columns with the most missing values
head(beforeNAs, 10)
##               Data_Fields  NAs
## PoolQC             PoolQC 1923
## MiscFeature   MiscFeature 1871
## Alley               Alley 1797
## Fence               Fence 1569
## FireplaceQu   FireplaceQu  938
## LotFrontage   LotFrontage  317
## GarageYrBlt   GarageYrBlt  107
## GarageFinish GarageFinish  107
## GarageQual     GarageQual  107
## GarageCond     GarageCond  107
# Drop columns (except SalePrice) that have more than 100 missing values
before2009 = before2009[, colSums(is.na(before2009)) <= 100 | colnames(before2009) == "SalePrice"]

# Drop Id and Utilities columns
before2009 = subset(before2009, select = -c(Id, Utilities))

# Print first 10 rows to verify
head(before2009, 10)
## # A tibble: 10 × 69
##     ...1 MSSubClass MSZoning LotArea Street LotShape LandContour LotConfig
##    <dbl> <fct>      <chr>      <dbl> <chr>  <chr>    <chr>       <chr>    
##  1     1 60         RL          8450 Pave   Reg      Lvl         Inside   
##  2     2 20         RL          9600 Pave   Reg      Lvl         FR2      
##  3     3 60         RL         11250 Pave   IR1      Lvl         Inside   
##  4     4 70         RL          9550 Pave   IR1      Lvl         Corner   
##  5     5 60         RL         14260 Pave   IR1      Lvl         FR2      
##  6     6 20         RL         10084 Pave   Reg      Lvl         Inside   
##  7     7 50         RM          6120 Pave   Reg      Lvl         Inside   
##  8     8 190        RL          7420 Pave   Reg      Lvl         Corner   
##  9     9 20         RL         11200 Pave   Reg      Lvl         Inside   
## 10    10 60         RL         11924 Pave   IR1      Lvl         Inside   
## # ℹ 61 more variables: LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>,
## #   Condition2 <chr>, BldgType <chr>, HouseStyle <chr>, OverallQual <ord>,
## #   OverallCond <ord>, YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>,
## #   RoofMatl <chr>, Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>,
## #   MasVnrArea <dbl>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## #   BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>,
## #   BsmtFinSF1 <dbl>, BsmtFinType2 <chr>, BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>, …
# Conduct multiple linear regression with SalePrice as the response
regBefore2009 = lm(SalePrice ~ ., data = before2009)

# Print summary to verify results
summary(regBefore2009)
## 
## Call:
## lm(formula = SalePrice ~ ., data = before2009)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -178538   -4758     -10    4645  159265 
## 
## Coefficients: (3 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.979e+06  1.036e+06  -1.911 0.056228 .  
## ...1                 -2.001e-01  7.139e-01  -0.280 0.779316    
## MSSubClass30         -4.076e+02  2.885e+03  -0.141 0.887651    
## MSSubClass40          4.599e+01  8.916e+03   0.005 0.995885    
## MSSubClass45          6.516e+03  1.211e+04   0.538 0.590536    
## MSSubClass50          3.835e+03  5.220e+03   0.735 0.462620    
## MSSubClass60          2.177e+03  5.185e+03   0.420 0.674623    
## MSSubClass70          4.484e+03  5.356e+03   0.837 0.402595    
## MSSubClass75          1.954e+03  7.860e+03   0.249 0.803755    
## MSSubClass80         -1.103e+04  7.484e+03  -1.474 0.140670    
## MSSubClass85         -8.479e+03  5.963e+03  -1.422 0.155256    
## MSSubClass90         -1.286e+04  4.923e+03  -2.612 0.009077 ** 
## MSSubClass120        -9.343e+03  7.154e+03  -1.306 0.191759    
## MSSubClass150         3.250e+03  1.938e+04   0.168 0.866816    
## MSSubClass160        -4.939e+03  9.193e+03  -0.537 0.591129    
## MSSubClass180        -8.511e+03  1.003e+04  -0.848 0.396372    
## MSSubClass190        -6.635e+03  1.293e+04  -0.513 0.608027    
## MSZoningFV            4.122e+04  7.309e+03   5.640 2.01e-08 ***
## MSZoningRH            2.680e+04  7.555e+03   3.547 0.000401 ***
## MSZoningRL            3.068e+04  6.286e+03   4.880 1.16e-06 ***
## MSZoningRM            3.081e+04  5.906e+03   5.216 2.06e-07 ***
## LotArea               5.397e-01  7.872e-02   6.856 1.00e-11 ***
## StreetPave            2.893e+04  6.801e+03   4.253 2.23e-05 ***
## LotShapeIR2           5.045e+03  2.509e+03   2.010 0.044552 *  
## LotShapeIR3           7.957e+03  5.077e+03   1.567 0.117219    
## LotShapeReg           6.244e+02  9.637e+02   0.648 0.517100    
## LandContourHLS        1.128e+04  2.964e+03   3.806 0.000146 ***
## LandContourLow        4.217e+02  4.091e+03   0.103 0.917916    
## LandContourLvl        1.029e+04  2.187e+03   4.702 2.79e-06 ***
## LotConfigCulDSac      6.668e+03  1.971e+03   3.382 0.000736 ***
## LotConfigFR2         -8.933e+03  2.640e+03  -3.383 0.000734 ***
## LotConfigFR3         -1.354e+04  5.142e+03  -2.632 0.008562 ** 
## LotConfigInside      -3.401e+03  1.091e+03  -3.117 0.001859 ** 
## LandSlopeMod          1.105e+04  2.446e+03   4.519 6.65e-06 ***
## LandSlopeSev         -1.999e+04  7.348e+03  -2.721 0.006575 ** 
## NeighborhoodBlueste  -1.050e+04  9.841e+03  -1.067 0.286085    
## NeighborhoodBrDale   -2.398e+03  6.859e+03  -0.350 0.726695    
## NeighborhoodBrkSide  -9.373e+03  5.554e+03  -1.688 0.091661 .  
## NeighborhoodClearCr  -2.004e+04  5.877e+03  -3.410 0.000666 ***
## NeighborhoodCollgCr  -1.606e+04  4.337e+03  -3.704 0.000219 ***
## NeighborhoodCrawfor   6.379e+03  5.039e+03   1.266 0.205724    
## NeighborhoodEdwards  -2.396e+04  4.793e+03  -5.000 6.35e-07 ***
## NeighborhoodGilbert  -1.691e+04  4.613e+03  -3.665 0.000255 ***
## NeighborhoodIDOTRR   -1.664e+04  6.043e+03  -2.753 0.005963 ** 
## NeighborhoodMeadowV  -2.073e+04  6.895e+03  -3.007 0.002683 ** 
## NeighborhoodMitchel  -2.794e+04  4.856e+03  -5.753 1.05e-08 ***
## NeighborhoodNAmes    -2.077e+04  4.676e+03  -4.441 9.54e-06 ***
## NeighborhoodNoRidge   1.610e+04  5.123e+03   3.143 0.001703 ** 
## NeighborhoodNPkVill   8.668e+03  1.038e+04   0.835 0.403900    
## NeighborhoodNridgHt   1.428e+04  4.516e+03   3.161 0.001601 ** 
## NeighborhoodNWAmes   -2.351e+04  4.817e+03  -4.880 1.17e-06 ***
## NeighborhoodOldTown  -2.152e+04  5.558e+03  -3.871 0.000113 ***
## NeighborhoodSawyer   -1.595e+04  4.850e+03  -3.289 0.001028 ** 
## NeighborhoodSawyerW  -1.082e+04  4.746e+03  -2.280 0.022746 *  
## NeighborhoodSomerst  -1.343e+04  5.278e+03  -2.544 0.011049 *  
## NeighborhoodStoneBr   2.972e+04  5.140e+03   5.782 8.85e-09 ***
## NeighborhoodSWISU    -1.141e+04  5.950e+03  -1.918 0.055330 .  
## NeighborhoodTimber   -1.174e+04  4.844e+03  -2.424 0.015457 *  
## NeighborhoodVeenker  -3.820e+03  5.876e+03  -0.650 0.515734    
## Condition1Feedr       2.234e+03  3.065e+03   0.729 0.466271    
## Condition1Norm        1.271e+04  2.568e+03   4.950 8.20e-07 ***
## Condition1PosA        1.150e+04  5.847e+03   1.967 0.049411 *  
## Condition1PosN        7.208e+03  4.654e+03   1.549 0.121596    
## Condition1RRAe       -1.508e+04  4.834e+03  -3.121 0.001837 ** 
## Condition1RRAn        1.055e+04  3.974e+03   2.654 0.008022 ** 
## Condition1RRNe       -1.997e+03  8.856e+03  -0.225 0.821646    
## Condition1RRNn        7.427e+03  9.379e+03   0.792 0.428546    
## Condition2Feedr      -1.153e+04  1.047e+04  -1.101 0.270935    
## Condition2Norm       -4.704e+03  9.203e+03  -0.511 0.609289    
## Condition2PosA       -6.917e+03  1.457e+04  -0.475 0.635070    
## Condition2PosN       -2.458e+05  1.375e+04 -17.880  < 2e-16 ***
## Condition2RRAe       -1.128e+05  2.415e+04  -4.673 3.21e-06 ***
## Condition2RRAn       -1.028e+04  1.880e+04  -0.547 0.584608    
## Condition2RRNn       -6.878e+02  1.485e+04  -0.046 0.963068    
## BldgType2fmCon       -4.073e+03  1.295e+04  -0.315 0.753087    
## BldgTypeDuplex               NA         NA      NA       NA    
## BldgTypeTwnhs        -1.669e+04  7.715e+03  -2.163 0.030686 *  
## BldgTypeTwnhsE       -1.623e+04  7.127e+03  -2.277 0.022927 *  
## HouseStyle1.5Unf      7.616e+03  1.129e+04   0.675 0.499938    
## HouseStyle1Story      1.116e+04  5.275e+03   2.116 0.034466 *  
## HouseStyle2.5Fin     -1.326e+04  1.044e+04  -1.270 0.204150    
## HouseStyle2.5Unf     -9.051e+03  7.513e+03  -1.205 0.228491    
## HouseStyle2Story     -7.049e+03  5.112e+03  -1.379 0.168116    
## HouseStyleSFoyer      1.372e+04  6.885e+03   1.993 0.046394 *  
## HouseStyleSLvl        1.542e+04  8.062e+03   1.913 0.055967 .  
## OverallQual.L         7.995e+04  7.197e+03  11.109  < 2e-16 ***
## OverallQual.Q         3.879e+04  6.133e+03   6.324 3.29e-10 ***
## OverallQual.C         1.851e+04  4.894e+03   3.783 0.000161 ***
## OverallQual^4         7.991e+03  3.931e+03   2.033 0.042227 *  
## OverallQual^5         4.060e+03  3.105e+03   1.308 0.191135    
## OverallQual^6         1.497e+03  2.273e+03   0.659 0.510296    
## OverallQual^7         1.208e+03  1.502e+03   0.804 0.421441    
## OverallQual^8         4.453e+01  9.513e+02   0.047 0.962671    
## OverallCond.L         5.466e+04  1.154e+04   4.738 2.35e-06 ***
## OverallCond.Q        -4.500e+03  8.535e+03  -0.527 0.598101    
## OverallCond.C         3.524e+03  8.176e+03   0.431 0.666500    
## OverallCond^4        -5.331e+02  1.069e+04  -0.050 0.960235    
## OverallCond^5         1.907e+03  1.075e+04   0.177 0.859178    
## OverallCond^6        -2.832e+01  8.149e+03  -0.003 0.997227    
## OverallCond^7        -1.228e+03  4.731e+03  -0.260 0.795227    
## OverallCond^8         8.598e+02  2.167e+03   0.397 0.691597    
## YearBuilt             3.354e+02  5.063e+01   6.625 4.72e-11 ***
## YearRemodAdd          1.000e+02  3.359e+01   2.978 0.002949 ** 
## RoofStyleGable       -2.570e+03  8.788e+03  -0.292 0.770018    
## RoofStyleGambrel      7.252e+02  9.865e+03   0.074 0.941410    
## RoofStyleHip         -1.456e+03  8.835e+03  -0.165 0.869113    
## RoofStyleMansard      1.046e+04  1.114e+04   0.939 0.347895    
## RoofStyleShed         8.737e+04  1.545e+04   5.656 1.82e-08 ***
## RoofMatlCompShg       6.588e+05  2.077e+04  31.715  < 2e-16 ***
## RoofMatlMembran       7.337e+05  2.954e+04  24.839  < 2e-16 ***
## RoofMatlMetal         6.956e+05  2.924e+04  23.790  < 2e-16 ***
## RoofMatlRoll          6.540e+05  2.704e+04  24.185  < 2e-16 ***
## RoofMatlTar&Grv       6.638e+05  2.230e+04  29.765  < 2e-16 ***
## RoofMatlWdShake       6.405e+05  2.212e+04  28.956  < 2e-16 ***
## RoofMatlWdShngl       7.384e+05  2.191e+04  33.695  < 2e-16 ***
## Exterior1stAsphShn   -1.658e+04  2.350e+04  -0.706 0.480461    
## Exterior1stBrkComm   -1.030e+04  1.485e+04  -0.694 0.487916    
## Exterior1stBrkFace    7.470e+03  8.884e+03   0.841 0.400546    
## Exterior1stCemntBd   -1.161e+04  1.261e+04  -0.921 0.357418    
## Exterior1stHdBoard   -1.121e+04  8.818e+03  -1.271 0.203762    
## Exterior1stImStucc   -6.537e+04  1.881e+04  -3.476 0.000523 ***
## Exterior1stMetalSd   -1.660e+03  9.484e+03  -0.175 0.861119    
## Exterior1stPlywood   -1.569e+04  8.759e+03  -1.792 0.073375 .  
## Exterior1stStone     -1.689e+04  2.040e+04  -0.828 0.407636    
## Exterior1stStucco    -7.786e+03  9.777e+03  -0.796 0.425892    
## Exterior1stVinylSd   -1.640e+04  9.652e+03  -1.699 0.089498 .  
## Exterior1stWd Sdng   -9.763e+03  8.612e+03  -1.134 0.257119    
## Exterior1stWdShing   -5.095e+03  9.094e+03  -0.560 0.575345    
## Exterior2ndAsphShn    3.134e+03  1.528e+04   0.205 0.837558    
## Exterior2ndBrk Cmn    3.700e+03  1.461e+04   0.253 0.800113    
## Exterior2ndBrkFace    2.171e+02  9.677e+03   0.022 0.982108    
## Exterior2ndCmentBd    8.848e+03  1.297e+04   0.682 0.495339    
## Exterior2ndHdBoard    2.825e+03  9.223e+03   0.306 0.759403    
## Exterior2ndImStucc    3.346e+04  1.029e+04   3.250 0.001176 ** 
## Exterior2ndMetalSd   -1.437e+03  9.819e+03  -0.146 0.883684    
## Exterior2ndOther     -9.966e+03  1.908e+04  -0.522 0.601439    
## Exterior2ndPlywood    3.444e+03  9.003e+03   0.383 0.702090    
## Exterior2ndStone      9.449e+02  2.643e+04   0.036 0.971485    
## Exterior2ndStucco     3.391e+00  1.000e+04   0.000 0.999729    
## Exterior2ndVinylSd    1.063e+04  9.968e+03   1.066 0.286414    
## Exterior2ndWd Sdng    5.274e+03  9.047e+03   0.583 0.560025    
## Exterior2ndWd Shng   -3.541e+03  9.434e+03  -0.375 0.707411    
## MasVnrTypeBrkFace     8.651e+03  4.308e+03   2.008 0.044764 *  
## MasVnrTypeNone        1.170e+04  4.333e+03   2.700 0.006999 ** 
## MasVnrTypeStone       1.263e+04  4.524e+03   2.792 0.005306 ** 
## MasVnrArea            1.745e+01  3.429e+00   5.087 4.06e-07 ***
## ExterQualFa           1.738e+04  6.660e+03   2.609 0.009159 ** 
## ExterQualGd          -7.165e+03  3.155e+03  -2.271 0.023267 *  
## ExterQualTA          -8.687e+03  3.467e+03  -2.505 0.012327 *  
## ExterCondFa          -1.164e+03  7.480e+03  -0.156 0.876357    
## ExterCondGd          -8.352e+03  6.786e+03  -1.231 0.218571    
## ExterCondTA          -5.100e+03  6.766e+03  -0.754 0.451071    
## FoundationCBlock      2.161e+03  1.876e+03   1.152 0.249554    
## FoundationPConc       6.421e+03  2.010e+03   3.195 0.001427 ** 
## FoundationStone       3.803e+03  8.048e+03   0.473 0.636598    
## FoundationWood       -2.113e+04  1.190e+04  -1.775 0.076083 .  
## BsmtQualFa           -6.816e+03  3.493e+03  -1.951 0.051208 .  
## BsmtQualGd           -8.203e+03  1.947e+03  -4.212 2.67e-05 ***
## BsmtQualTA           -6.553e+03  2.450e+03  -2.675 0.007559 ** 
## BsmtCondGd           -3.628e+02  2.975e+03  -0.122 0.902960    
## BsmtCondPo           -3.526e+03  1.677e+04  -0.210 0.833552    
## BsmtCondTA            2.853e+03  2.357e+03   1.210 0.226316    
## BsmtExposureGd        1.760e+03  1.696e+03   1.038 0.299573    
## BsmtExposureMn       -2.947e+03  1.743e+03  -1.691 0.091072 .  
## BsmtExposureNo       -3.732e+03  1.315e+03  -2.837 0.004611 ** 
## BsmtFinType1BLQ      -4.615e+02  1.681e+03  -0.275 0.783701    
## BsmtFinType1GLQ       1.793e+03  1.494e+03   1.200 0.230358    
## BsmtFinType1LwQ      -2.067e+03  2.124e+03  -0.973 0.330670    
## BsmtFinType1Rec      -2.125e+03  1.661e+03  -1.280 0.200876    
## BsmtFinType1Unf      -6.962e+02  1.704e+03  -0.409 0.682843    
## BsmtFinSF1            3.275e+01  2.902e+00  11.285  < 2e-16 ***
## BsmtFinType2BLQ      -4.968e+03  4.278e+03  -1.161 0.245695    
## BsmtFinType2GLQ      -4.276e+03  4.811e+03  -0.889 0.374282    
## BsmtFinType2LwQ      -6.677e+03  4.074e+03  -1.639 0.101460    
## BsmtFinType2Rec      -5.203e+03  3.930e+03  -1.324 0.185711    
## BsmtFinType2Unf      -1.494e+03  4.173e+03  -0.358 0.720389    
## BsmtFinSF2            2.707e+01  5.120e+00   5.287 1.41e-07 ***
## BsmtUnfSF             1.373e+01  2.658e+00   5.166 2.69e-07 ***
## TotalBsmtSF                  NA         NA      NA       NA    
## HeatingGasW          -8.469e+03  4.493e+03  -1.885 0.059598 .  
## HeatingGrav          -5.136e+03  8.868e+03  -0.579 0.562550    
## HeatingOthW          -2.774e+04  1.234e+04  -2.248 0.024710 *  
## HeatingQCFa          -1.811e+03  2.774e+03  -0.653 0.513827    
## HeatingQCGd          -3.584e+03  1.202e+03  -2.980 0.002923 ** 
## HeatingQCPo           1.080e+04  1.261e+04   0.857 0.391825    
## HeatingQCTA          -3.653e+03  1.216e+03  -3.004 0.002706 ** 
## CentralAirY           1.928e+02  2.224e+03   0.087 0.930912    
## ElectricalFuseF      -2.528e+03  3.758e+03  -0.673 0.501197    
## ElectricalFuseP      -9.087e+03  7.764e+03  -1.170 0.241990    
## ElectricalMix         1.082e+04  3.276e+04   0.330 0.741215    
## ElectricalSBrkr      -1.345e+03  1.796e+03  -0.749 0.454141    
## X1stFlrSF             5.305e+01  3.083e+00  17.207  < 2e-16 ***
## X2ndFlrSF             6.563e+01  3.139e+00  20.910  < 2e-16 ***
## LowQualFinSF          1.285e+01  1.117e+01   1.151 0.250101    
## GrLivArea                    NA         NA      NA       NA    
## BsmtFullBath          1.677e+03  1.168e+03   1.436 0.151213    
## BsmtHalfBath          4.271e+02  1.679e+03   0.254 0.799179    
## FullBath              3.899e+03  1.335e+03   2.921 0.003532 ** 
## HalfBath              1.827e+02  1.279e+03   0.143 0.886482    
## BedroomAbvGr         -3.131e+03  8.360e+02  -3.745 0.000187 ***
## KitchenAbvGr         -7.469e+03  4.300e+03  -1.737 0.082566 .  
## KitchenQualFa        -1.486e+04  3.748e+03  -3.963 7.72e-05 ***
## KitchenQualGd        -1.960e+04  2.175e+03  -9.015  < 2e-16 ***
## KitchenQualTA        -1.723e+04  2.413e+03  -7.142 1.38e-12 ***
## TotRmsAbvGrd          6.885e+02  5.737e+02   1.200 0.230333    
## FunctionalMaj2       -2.119e+03  1.192e+04  -0.178 0.858921    
## FunctionalMin1        3.818e+03  5.803e+03   0.658 0.510726    
## FunctionalMin2        5.728e+03  6.052e+03   0.946 0.344092    
## FunctionalMod        -5.051e+03  6.566e+03  -0.769 0.441784    
## FunctionalSev        -4.922e+04  1.859e+04  -2.648 0.008168 ** 
## FunctionalTyp         1.642e+04  5.151e+03   3.187 0.001466 ** 
## Fireplaces            4.245e+03  8.150e+02   5.209 2.15e-07 ***
## GarageCars            2.358e+03  1.311e+03   1.799 0.072180 .  
## GarageArea            1.766e+01  4.447e+00   3.972 7.43e-05 ***
## PavedDriveP          -3.387e+03  3.110e+03  -1.089 0.276296    
## PavedDriveY          -1.302e+03  2.025e+03  -0.643 0.520199    
## WoodDeckSF            1.327e+01  3.461e+00   3.836 0.000130 ***
## OpenPorchSF           1.465e+01  6.522e+00   2.246 0.024834 *  
## EnclosedPorch         3.246e+00  6.949e+00   0.467 0.640530    
## X3SsnPorch            5.795e+01  1.758e+01   3.297 0.000998 ***
## ScreenPorch           2.329e+01  7.240e+00   3.217 0.001321 ** 
## PoolArea              6.857e+01  1.018e+01   6.736 2.25e-11 ***
## MiscVal              -4.092e-01  6.633e-01  -0.617 0.537426    
## MoSold               -5.282e+02  1.438e+02  -3.672 0.000249 ***
## YrSold                2.363e+02  5.126e+02   0.461 0.644787    
## SaleTypeCon           3.746e+04  9.798e+03   3.823 0.000137 ***
## SaleTypeConLD         1.199e+04  5.332e+03   2.249 0.024674 *  
## SaleTypeConLI        -3.401e+03  1.008e+04  -0.337 0.735798    
## SaleTypeConLw        -4.058e+02  8.787e+03  -0.046 0.963174    
## SaleTypeCWD           2.101e+04  5.381e+03   3.904 9.86e-05 ***
## SaleTypeNew           1.231e+04  8.920e+03   1.380 0.167677    
## SaleTypeOth           9.717e+03  9.802e+03   0.991 0.321657    
## SaleTypeWD           -1.370e+03  2.571e+03  -0.533 0.594137    
## SaleConditionAdjLand  9.748e+03  6.362e+03   1.532 0.125645    
## SaleConditionAlloca   8.512e+03  6.135e+03   1.387 0.165492    
## SaleConditionFamily  -2.448e+02  3.260e+03  -0.075 0.940153    
## SaleConditionNormal   4.339e+03  1.736e+03   2.499 0.012568 *  
## SaleConditionPartial  7.276e+03  8.540e+03   0.852 0.394360    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15630 on 1616 degrees of freedom
##   (82 observations deleted due to missingness)
## Multiple R-squared:  0.9662, Adjusted R-squared:  0.9613 
## F-statistic: 197.4 on 234 and 1616 DF,  p-value: < 2.2e-16
# R-squared = 0.9662: The model explains 96.6% of the variance in house prices before 2009
# Adjusted R-squared = 0.9613
# F-statistic: Highly significant p-value: < 2.2e-16
# Looking at the above output, coefficients with P-values less than .05 and significance flags (*, **, ***) were identified. Physical and quality related variables showed very low p-values.

# Variables grouped by category (e.g. multiple neighborhood variables into one neighborhood category).

regBefore2009optimal = lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
                          TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
                          Fireplaces + LotArea + BsmtFinSF1 + KitchenQual +
                          Neighborhood + ExterQual + X1stFlrSF, data = before2009)
summary(regBefore2009optimal)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd + 
##     Fireplaces + LotArea + BsmtFinSF1 + KitchenQual + Neighborhood + 
##     ExterQual + X1stFlrSF, data = before2009)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -500606  -11301    1060   11433  209410 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -8.828e+05  1.262e+05  -6.997 3.64e-12 ***
## OverallQual.L        1.342e+05  1.723e+04   7.793 1.08e-14 ***
## OverallQual.Q        4.380e+04  1.606e+04   2.728 0.006429 ** 
## OverallQual.C        1.714e+04  1.387e+04   1.236 0.216709    
## OverallQual^4        1.015e+04  1.141e+04   0.889 0.374106    
## OverallQual^5       -7.806e+03  9.180e+03  -0.850 0.395290    
## OverallQual^6        4.627e+03  7.023e+03   0.659 0.510102    
## OverallQual^7        7.792e+02  4.986e+03   0.156 0.875825    
## OverallQual^8        1.298e+03  3.231e+03   0.402 0.687990    
## OverallQual^9        2.951e+02  1.860e+03   0.159 0.873982    
## GrLivArea            4.257e+01  2.238e+00  19.021  < 2e-16 ***
## GarageCars           8.710e+03  2.078e+03   4.190 2.91e-05 ***
## GarageArea           8.002e+00  7.148e+00   1.120 0.263057    
## TotalBsmtSF          5.077e+00  2.883e+00   1.761 0.078407 .  
## FullBath             1.492e+02  1.825e+03   0.082 0.934877    
## YearBuilt            2.502e+02  5.062e+01   4.943 8.40e-07 ***
## YearRemodAdd         2.464e+02  4.668e+01   5.278 1.46e-07 ***
## Fireplaces           7.716e+03  1.270e+03   6.078 1.48e-09 ***
## LotArea              4.237e-01  9.938e-02   4.263 2.12e-05 ***
## BsmtFinSF1           1.457e+01  1.788e+00   8.152 6.50e-16 ***
## KitchenQualFa       -2.481e+04  6.012e+03  -4.126 3.85e-05 ***
## KitchenQualGd       -2.356e+04  3.599e+03  -6.545 7.66e-11 ***
## KitchenQualTA       -2.650e+04  3.952e+03  -6.706 2.65e-11 ***
## NeighborhoodBlueste -2.440e+04  1.545e+04  -1.579 0.114407    
## NeighborhoodBrDale  -2.125e+04  9.298e+03  -2.285 0.022414 *  
## NeighborhoodBrkSide  6.000e+03  7.953e+03   0.754 0.450663    
## NeighborhoodClearCr  1.345e+04  8.601e+03   1.564 0.117931    
## NeighborhoodCollgCr  5.298e+03  6.731e+03   0.787 0.431293    
## NeighborhoodCrawfor  2.618e+04  7.662e+03   3.418 0.000645 ***
## NeighborhoodEdwards -1.292e+04  7.347e+03  -1.758 0.078946 .  
## NeighborhoodGilbert -4.907e+02  7.055e+03  -0.070 0.944556    
## NeighborhoodIDOTRR  -8.049e+03  8.110e+03  -0.992 0.321115    
## NeighborhoodMeadowV -2.180e+04  9.018e+03  -2.418 0.015711 *  
## NeighborhoodMitchel -8.006e+03  7.451e+03  -1.075 0.282725    
## NeighborhoodNAmes   -1.839e+03  7.067e+03  -0.260 0.794740    
## NeighborhoodNoRidge  4.933e+04  7.856e+03   6.280 4.21e-10 ***
## NeighborhoodNPkVill -7.464e+03  1.144e+04  -0.652 0.514223    
## NeighborhoodNridgHt  3.338e+04  7.219e+03   4.624 4.02e-06 ***
## NeighborhoodNWAmes  -3.209e+03  7.293e+03  -0.440 0.659995    
## NeighborhoodOldTown -6.940e+03  7.724e+03  -0.899 0.369023    
## NeighborhoodSawyer  -5.395e+03  7.391e+03  -0.730 0.465525    
## NeighborhoodSawyerW  2.589e+03  7.338e+03   0.353 0.724234    
## NeighborhoodSomerst  5.698e+03  6.933e+03   0.822 0.411276    
## NeighborhoodStoneBr  3.892e+04  8.103e+03   4.803 1.69e-06 ***
## NeighborhoodSWISU    1.887e+03  9.097e+03   0.207 0.835658    
## NeighborhoodTimber   1.110e+04  7.577e+03   1.466 0.142914    
## NeighborhoodVeenker  2.919e+04  9.011e+03   3.239 0.001221 ** 
## ExterQualFa          1.023e+03  9.546e+03   0.107 0.914659    
## ExterQualGd         -6.376e+03  5.145e+03  -1.239 0.215397    
## ExterQualTA         -6.487e+03  5.618e+03  -1.155 0.248307    
## X1stFlrSF            5.766e+00  3.327e+00   1.733 0.083238 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27640 on 1859 degrees of freedom
##   (23 observations deleted due to missingness)
## Multiple R-squared:  0.8833, Adjusted R-squared:  0.8801 
## F-statistic: 281.3 on 50 and 1859 DF,  p-value: < 2.2e-16
# Multiple R-squared: 0.8816, Adjusted R-squared: 0.8784, F-statistic: 273.8 on 49 and 1801 DF, p-value: < 2.2e-16.
# The predictors explained approximately 88%.
library(ggplot2)
library(ggfortify)
# Display diagnostic plots for the regression model
autoplot(regBefore2009optimal)
## Warning: `fortify(<lm>)` was deprecated in ggplot2 3.6.0.
## ℹ Please use `broom::augment(<lm>)` instead.
## ℹ The deprecated feature was likely used in the ggfortify package.
##   Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggfortify package.
##   Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggfortify package.
##   Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 409 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Load the after2009 dataset
after2009 = read_csv("PricesAfter2009.csv")
## New names:
## Rows: 986 Columns: 82
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (42): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf... dbl
## (39): ...1, Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCo... lgl
## (1): PoolQC
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Convert MSSubClass, OverallQual, and OverallCond to factors (same as before2009)
after2009$MSSubClass = as.factor(after2009$MSSubClass)
after2009$OverallQual = as.factor(after2009$OverallQual)
after2009$OverallCond = as.factor(after2009$OverallCond)

# OverallQual and OverallCond are ordinal (rating scales)
after2009$OverallQual <- ordered(after2009$OverallQual)
after2009$OverallCond <- ordered(after2009$OverallCond)

# Drop columns (except SalePrice) that have more than 100 missing values
after2009 = after2009[, colSums(is.na(after2009)) <= 100 | colnames(after2009) == "SalePrice"]

# Drop Id and Utilities columns
after2009 = subset(after2009, select = -c(Id, Utilities))

# Print first 10 rows to verify
head(after2009, 10)
## # A tibble: 10 × 74
##     ...1 MSSubClass MSZoning LotArea Street LotShape LandContour LotConfig
##    <dbl> <fct>      <chr>      <dbl> <chr>  <chr>    <chr>       <chr>    
##  1     1 50         RL         14115 Pave   IR1      Lvl         Inside   
##  2     2 60         RL         10382 Pave   IR1      Lvl         Corner   
##  3     3 20         RL         11241 Pave   IR1      Lvl         CulDSac  
##  4     4 20         RL          7560 Pave   Reg      Lvl         Inside   
##  5     5 20         RL          8246 Pave   IR1      Lvl         Inside   
##  6     6 20         RL         14230 Pave   Reg      Lvl         Corner   
##  7     7 20         RL          7200 Pave   Reg      Lvl         Corner   
##  8     8 20         RL         11478 Pave   Reg      Lvl         Inside   
##  9     9 20         RL         10552 Pave   IR1      Lvl         Inside   
## 10    10 20         RL         10859 Pave   Reg      Lvl         Corner   
## # ℹ 66 more variables: LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>,
## #   Condition2 <chr>, BldgType <chr>, HouseStyle <chr>, OverallQual <ord>,
## #   OverallCond <ord>, YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>,
## #   RoofMatl <chr>, Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>,
## #   MasVnrArea <dbl>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## #   BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>,
## #   BsmtFinSF1 <dbl>, BsmtFinType2 <chr>, BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>, …
library(ggplot2)

# Create a density plot of SalePrice for all neighborhoods after 2009
# Used free_y to allow each neighborhood to have its own y-axis
# X axis is Sale Price, Y axis shows relative frequency via density

ggplot(after2009, aes(x = SalePrice)) +
  geom_density(fill = "red", alpha = 0.5, color = "black") +
  facet_wrap(~ Neighborhood, scales = "free_y") +
  scale_x_continuous(labels = scales::comma, breaks = seq(0, 500000, 100000)) +
  ggtitle("Density Plot of Sale Prices by Neighborhood (After 2009)") +
  xlab("Sale Price ($)") +
  ylab("Density") +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 6))
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density()`).

# The majority of neighborhoods display smooth, single-peaked price distributions. Difficult to see any fraud activity at this level.
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
# Density plot for NAmes before 2009
pBefore2009NAmes = ggplot(before2009[before2009$Neighborhood == "NAmes", ], aes(x = SalePrice)) +
  geom_density(fill = "yellow", alpha = 0.5) +
  ggtitle("Density Plot NAmes Before 2009") +
  xlab("Sale Price") +
  xlim(0, 800000) +
  scale_x_continuous(labels = scales::comma) +
  theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Density plot for NAmes after 2009
pAfter2009NAmes = ggplot(after2009[after2009$Neighborhood == "NAmes", ], aes(x = SalePrice)) +
  geom_density(fill = "orange", alpha = 0.5) +
  ggtitle("Density Plot NAmes After 2009") +
  xlab("Sale Price") +
  xlim(0, 800000) +
  scale_x_continuous(labels = scales::comma) +
  theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Density plot for Gilbert before 2009
pBefore2009Gilbert = ggplot(before2009[before2009$Neighborhood == "Gilbert", ], aes(x = SalePrice)) +
  geom_density(fill = "lightgreen", alpha = 0.5) +
  ggtitle("Density Plot Gilbert Before 2009") +
  xlab("Sale Price") +
  xlim(0, 800000) +
  scale_x_continuous(labels = scales::comma) +
  theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Density plot for Gilbert after 2009
pAfter2009Gilbert = ggplot(after2009[after2009$Neighborhood == "Gilbert", ], aes(x = SalePrice)) +
  geom_density(fill = "lightblue", alpha = 0.5) +
  ggtitle("Density Plot Gilbert After 2009") +
  xlab("Sale Price") +
  xlim(0, 800000) +
  scale_x_continuous(labels = scales::comma) +
  theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
grid.arrange(pBefore2009NAmes, pAfter2009NAmes, pBefore2009Gilbert, pAfter2009Gilbert, nrow = 2)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

# NAmes shows no clear signs of pricing anomalies or fraud. Prices remained consistent before and after 2009, suggesting stable market behavior.
# Gilbert's after2009 curve suggests possible irregularities. The sharper peak and multiple bumps indicate sales concentrated at unusual price points, a pattern consistent with fraudulent activity clustering prices around artificial targets.
# Run multiple linear regression on after2009 using the same variables as before2009optimal
regAfter2009optimal = lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
                         TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
                         Fireplaces + LotArea + BsmtFinSF1 + KitchenQual +
                         Neighborhood + ExterQual + X1stFlrSF, data = after2009)

summary(regAfter2009optimal)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd + 
##     Fireplaces + LotArea + BsmtFinSF1 + KitchenQual + Neighborhood + 
##     ExterQual + X1stFlrSF, data = after2009)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -270665   -8685    1348   11161  205033 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.067e+06  1.877e+05  -5.687 1.73e-08 ***
## OverallQual.L        1.004e+05  1.948e+04   5.155 3.10e-07 ***
## OverallQual.Q        4.323e+04  1.781e+04   2.427  0.01542 *  
## OverallQual.C        1.526e+04  1.489e+04   1.025  0.30563    
## OverallQual^4        1.088e+04  1.198e+04   0.909  0.36384    
## OverallQual^5        1.379e+03  9.267e+03   0.149  0.88176    
## OverallQual^6        2.161e+03  7.357e+03   0.294  0.76906    
## OverallQual^7        1.263e+03  5.768e+03   0.219  0.82670    
## OverallQual^8       -8.692e+02  4.125e+03  -0.211  0.83317    
## OverallQual^9        1.147e+03  2.541e+03   0.451  0.65183    
## GrLivArea            4.609e+01  3.335e+00  13.822  < 2e-16 ***
## GarageCars           3.140e+03  2.955e+03   1.062  0.28829    
## GarageArea           1.906e+01  1.027e+01   1.856  0.06374 .  
## TotalBsmtSF          1.857e+01  4.019e+00   4.622 4.34e-06 ***
## FullBath             1.890e+03  2.526e+03   0.748  0.45463    
## YearBuilt            3.183e+02  7.503e+01   4.242 2.43e-05 ***
## YearRemodAdd         2.773e+02  6.573e+01   4.219 2.70e-05 ***
## Fireplaces           3.214e+03  1.764e+03   1.822  0.06882 .  
## LotArea              8.393e-01  1.295e-01   6.482 1.47e-10 ***
## BsmtFinSF1           2.552e+01  2.536e+00  10.063  < 2e-16 ***
## KitchenQualFa       -2.892e+04  8.816e+03  -3.281  0.00107 ** 
## KitchenQualGd       -2.508e+04  5.746e+03  -4.364 1.42e-05 ***
## KitchenQualTA       -2.845e+04  6.210e+03  -4.582 5.23e-06 ***
## NeighborhoodBlueste -2.578e+03  1.577e+04  -0.163  0.87020    
## NeighborhoodBrDale   1.149e+04  1.456e+04   0.789  0.43031    
## NeighborhoodBrkSide  2.711e+04  1.248e+04   2.172  0.03011 *  
## NeighborhoodClearCr  2.756e+04  1.345e+04   2.048  0.04080 *  
## NeighborhoodCollgCr  1.582e+04  1.082e+04   1.462  0.14405    
## NeighborhoodCrawfor  4.999e+04  1.219e+04   4.102 4.46e-05 ***
## NeighborhoodEdwards  9.523e+03  1.159e+04   0.822  0.41140    
## NeighborhoodGilbert  6.744e+03  1.107e+04   0.609  0.54254    
## NeighborhoodIDOTRR   6.987e+03  1.360e+04   0.514  0.60763    
## NeighborhoodMeadowV -3.260e+03  1.408e+04  -0.232  0.81693    
## NeighborhoodMitchel  9.587e+03  1.154e+04   0.831  0.40639    
## NeighborhoodNAmes    1.539e+04  1.121e+04   1.373  0.17012    
## NeighborhoodNoRidge  4.836e+04  1.233e+04   3.922 9.42e-05 ***
## NeighborhoodNPkVill  7.519e+03  1.315e+04   0.572  0.56775    
## NeighborhoodNridgHt  2.118e+04  1.123e+04   1.885  0.05977 .  
## NeighborhoodNWAmes   1.102e+04  1.149e+04   0.959  0.33788    
## NeighborhoodOldTown  1.001e+04  1.226e+04   0.816  0.41447    
## NeighborhoodSawyer   1.102e+04  1.169e+04   0.943  0.34613    
## NeighborhoodSawyerW  1.350e+04  1.104e+04   1.224  0.22142    
## NeighborhoodSomerst  1.333e+04  1.097e+04   1.215  0.22452    
## NeighborhoodStoneBr  5.216e+04  1.319e+04   3.955 8.23e-05 ***
## NeighborhoodSWISU    1.269e+04  1.324e+04   0.958  0.33827    
## NeighborhoodTimber   1.701e+04  1.224e+04   1.390  0.16495    
## NeighborhoodVeenker  3.652e+04  1.791e+04   2.039  0.04176 *  
## ExterQualFa         -3.475e+04  1.147e+04  -3.030  0.00251 ** 
## ExterQualGd         -3.252e+04  7.601e+03  -4.279 2.07e-05 ***
## ExterQualTA         -3.656e+04  8.230e+03  -4.442 9.97e-06 ***
## X1stFlrSF           -1.233e+01  4.879e+00  -2.526  0.01169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28230 on 930 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.8691, Adjusted R-squared:  0.8621 
## F-statistic: 123.5 on 50 and 930 DF,  p-value: < 2.2e-16
library(ggfortify)
autoplot(regAfter2009optimal)
## Warning: Removed 263 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

# The Residuals vs Fitted graph and Scale-Location graph show a clear deviation indicating possible fraud.
ggplot(data = regAfter2009optimal, aes(x = .fitted, y = .resid)) +
  geom_point(color = "blue", alpha = 0.6) +
  stat_smooth(method = "loess", color = "red", se = FALSE) +
  ggtitle("Residuals vs Fitted Values (After 2009)") +
  xlab("Fitted Values") +
  ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'

# The Residuals vs Fitted plot shows a line of outliers deviating towards the bottom right from the main cluster. These points represent potential outliers with unusually high or low residuals, indicating properties with sale prices much higher or lower than expected, suggesting possible data errors or fraudulent activity.
# Method: Use the before2009 model to generate realistic sale prices for flagged rows.
# Step 1: Identify rows in after2009 where old fraud occurred (SalePrice == 142769.7).
# Step 2: Use regBefore2009optimal to predict what those houses should have sold for based on their features.
# Step 3: Add small random variation (about +/-1%) to make prices look natural.
# Step 4: Replace only the flagged SalePrice values with the new model-based values.
# Rationale: Using the model ensures each replaced value makes sense for its specific house. Adding random noise simulates natural market fluctuations so values do not appear computer-generated.
# Density plots for NAmes and Gilbert before fraud fix.
# The red dashed line marks the artificially repeated price of $142,769.70.
# In Gilbert, the clustering produces a visible bump near the red line, suggesting tampering.

pFraudNAmes <- ggplot(after2009 %>% filter(Neighborhood == "NAmes"),
                      aes(x = SalePrice)) +
  geom_density(fill = "yellow", alpha = 0.5, color = "black") +
  geom_vline(xintercept = 142769.7, color = "red", linetype = "dashed", size = 1) +
  ggtitle("NAmes - Before Fraud Fix (Showing Fraud Spike)") +
  xlab("Sale Price") +
  ylab("Density") +
  scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
  theme_minimal()

pFraudGilbert <- ggplot(after2009 %>% filter(Neighborhood == "Gilbert"),
                        aes(x = SalePrice)) +
  geom_density(fill = "lightgreen", alpha = 0.5, color = "black") +
  geom_vline(xintercept = 142769.7, color = "red", linetype = "dashed", size = 1) +
  ggtitle("Gilbert - Before Fraud Fix (Showing Fraud Spike)") +
  xlab("Sale Price") +
  ylab("Density") +
  scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
  theme_minimal()

grid.arrange(pFraudNAmes, pFraudGilbert, nrow = 1)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

library(dplyr)
library(ggplot2)
library(gridExtra)
library(ggfortify)

# Find which rows have the suspicious SalePrice
flagged_rows_before2009 = which(before2009$SalePrice == 142769.7)
flagged_rows_after2009 = which(after2009$SalePrice == 142769.7)

flagged_rows_before2009
## integer(0)
flagged_rows_after2009
##  [1] 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
## [20] 533 534 535 536 537 538 539 540 541 542 543
after2009[flagged_rows_after2009[1:30], c("Neighborhood", "SalePrice")]
## # A tibble: 30 × 2
##    Neighborhood SalePrice
##    <chr>            <dbl>
##  1 NAmes          142770.
##  2 NAmes          142770.
##  3 Gilbert        142770.
##  4 Gilbert        142770.
##  5 StoneBr        142770.
##  6 Gilbert        142770.
##  7 Gilbert        142770.
##  8 Gilbert        142770.
##  9 Gilbert        142770.
## 10 NAmes          142770.
## # ℹ 20 more rows
# Use regBefore2009optimal to predict realistic prices for the flagged rows
predicted_prices = predict(regBefore2009optimal,
                           newdata = after2009[flagged_rows_after2009, ])
data.frame(Row = 1:length(predicted_prices), Predicted_Price = predicted_prices)
##    Row Predicted_Price
## 1    1       113982.88
## 2    2       149850.86
## 3    3       185304.98
## 4    4       189740.28
## 5    5       237263.80
## 6    6       175960.88
## 7    7       168389.62
## 8    8       169338.74
## 9    9       191716.01
## 10  10       121195.49
## 11  11       199681.93
## 12  12        93329.86
## 13  13        93989.93
## 14  14       159492.99
## 15  15       116950.86
## 16  16       364909.75
## 17  17       265628.27
## 18  18       331055.77
## 19  19       309175.17
## 20  20       419873.82
## 21  21       308994.86
## 22  22       248445.75
## 23  23       172523.66
## 24  24       169879.98
## 25  25       188182.62
## 26  26       199920.35
## 27  27       326907.18
## 28  28       244842.21
## 29  29       185438.69
## 30  30       212098.56
summary(predicted_prices)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   93330  168627  188961  210469  247545  419874
fraud_neighborhoods = after2009[flagged_rows_after2009, c("Neighborhood", "GrLivArea", "OverallQual", "SalePrice")]
head(fraud_neighborhoods, 10)
## # A tibble: 10 × 4
##    Neighborhood GrLivArea OverallQual SalePrice
##    <chr>            <dbl> <ord>           <dbl>
##  1 NAmes              896 5             142770.
##  2 NAmes             1329 6             142770.
##  3 Gilbert           1629 5             142770.
##  4 Gilbert           1604 6             142770.
##  5 StoneBr           1280 8             142770.
##  6 Gilbert           1655 6             142770.
##  7 Gilbert           1187 6             142770.
##  8 Gilbert           1465 6             142770.
##  9 Gilbert           1341 7             142770.
## 10 NAmes              882 4             142770.
table(fraud_neighborhoods$Neighborhood)
## 
## Blmngtn  BrDale Gilbert   NAmes NoRidge NPkVill NridgHt Somerst StoneBr 
##       1       2       9       4       1       2       7       3       1
after2009_adjusted = after2009
# Set seed for reproducibility. Add small random noise (~1% SD) around predicted prices to simulate natural market variation.
set.seed(123)
noise = rnorm(length(predicted_prices),
              mean = 0,
              sd = predicted_prices * 0.01)
new_prices = predicted_prices + noise
after2009_adjusted = after2009
after2009_adjusted$SalePrice[flagged_rows_after2009] = new_prices
# Compare density plots before and after price adjustment

pBeforeNAmes <- ggplot(after2009 %>% filter(Neighborhood == "NAmes"),
                       aes(x = SalePrice)) +
  geom_density(fill = "yellow", alpha = 0.5, color = "black") +
  geom_vline(xintercept = 142769.7, color = "red", linetype = "dashed", size = 1) +
  ggtitle("NAmes - Before Fraud Fix (Spike @142,769)") +
  xlab("Sale Price") + ylab("Density") +
  scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
  theme_minimal()

pAfterNAmes <- ggplot(after2009_adjusted %>% filter(Neighborhood == "NAmes"),
                      aes(x = SalePrice)) +
  geom_density(fill = "orange", alpha = 0.5, color = "black") +
  ggtitle("NAmes - After Fraud Fix (Smoothed)") +
  xlab("Sale Price") + ylab("Density") +
  scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
  theme_minimal()

pBeforeGilbert <- ggplot(after2009 %>% filter(Neighborhood == "Gilbert"),
                         aes(x = SalePrice)) +
  geom_density(fill = "lightgreen", alpha = 0.5, color = "black") +
  geom_vline(xintercept = 142769.7, color = "red", linetype = "dashed", size = 1) +
  ggtitle("Gilbert - Before Fraud Fix (Spike @142,769)") +
  xlab("Sale Price") + ylab("Density") +
  scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
  theme_minimal()

pAfterGilbert <- ggplot(after2009_adjusted %>% filter(Neighborhood == "Gilbert"),
                        aes(x = SalePrice)) +
  geom_density(fill = "lightblue", alpha = 0.5, color = "black") +
  ggtitle("Gilbert - After Fraud Fix (Smoothed)") +
  xlab("Sale Price") + ylab("Density") +
  scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
  theme_minimal()

grid.arrange(pBeforeNAmes, pAfterNAmes, pBeforeGilbert, pAfterGilbert, nrow = 2)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

# After applying the fraud fix, the spike at $142,769.70 disappears in both neighborhoods and the distribution becomes smooth, confirming the correction blended well with legitimate market data.
library(broom)
library(ggplot2)
regAfter2009_adjusted = lm(formula(regAfter2009optimal), data = after2009_adjusted)

# augment combines predicted values, residuals, and influence statistics for outlier visualization
ggplot(augment(regAfter2009_adjusted),
       aes(x = .fitted, y = .resid)) +
  geom_point(alpha = 0.6, color = "blue") +
  stat_smooth(method = "loess", color = "red", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# The Residuals vs Fitted plot confirms the fraud fix was effective. Residuals are now evenly scattered around zero with no visible clustering.
regAfter2009optimalFraud = lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
                              TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
                              Fireplaces + LotArea + BsmtFinSF1 + KitchenQual +
                              Neighborhood + ExterQual + X1stFlrSF, data = after2009)

summary(regAfter2009optimalFraud)
## 
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars + 
##     GarageArea + TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd + 
##     Fireplaces + LotArea + BsmtFinSF1 + KitchenQual + Neighborhood + 
##     ExterQual + X1stFlrSF, data = after2009)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -270665   -8685    1348   11161  205033 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -1.067e+06  1.877e+05  -5.687 1.73e-08 ***
## OverallQual.L        1.004e+05  1.948e+04   5.155 3.10e-07 ***
## OverallQual.Q        4.323e+04  1.781e+04   2.427  0.01542 *  
## OverallQual.C        1.526e+04  1.489e+04   1.025  0.30563    
## OverallQual^4        1.088e+04  1.198e+04   0.909  0.36384    
## OverallQual^5        1.379e+03  9.267e+03   0.149  0.88176    
## OverallQual^6        2.161e+03  7.357e+03   0.294  0.76906    
## OverallQual^7        1.263e+03  5.768e+03   0.219  0.82670    
## OverallQual^8       -8.692e+02  4.125e+03  -0.211  0.83317    
## OverallQual^9        1.147e+03  2.541e+03   0.451  0.65183    
## GrLivArea            4.609e+01  3.335e+00  13.822  < 2e-16 ***
## GarageCars           3.140e+03  2.955e+03   1.062  0.28829    
## GarageArea           1.906e+01  1.027e+01   1.856  0.06374 .  
## TotalBsmtSF          1.857e+01  4.019e+00   4.622 4.34e-06 ***
## FullBath             1.890e+03  2.526e+03   0.748  0.45463    
## YearBuilt            3.183e+02  7.503e+01   4.242 2.43e-05 ***
## YearRemodAdd         2.773e+02  6.573e+01   4.219 2.70e-05 ***
## Fireplaces           3.214e+03  1.764e+03   1.822  0.06882 .  
## LotArea              8.393e-01  1.295e-01   6.482 1.47e-10 ***
## BsmtFinSF1           2.552e+01  2.536e+00  10.063  < 2e-16 ***
## KitchenQualFa       -2.892e+04  8.816e+03  -3.281  0.00107 ** 
## KitchenQualGd       -2.508e+04  5.746e+03  -4.364 1.42e-05 ***
## KitchenQualTA       -2.845e+04  6.210e+03  -4.582 5.23e-06 ***
## NeighborhoodBlueste -2.578e+03  1.577e+04  -0.163  0.87020    
## NeighborhoodBrDale   1.149e+04  1.456e+04   0.789  0.43031    
## NeighborhoodBrkSide  2.711e+04  1.248e+04   2.172  0.03011 *  
## NeighborhoodClearCr  2.756e+04  1.345e+04   2.048  0.04080 *  
## NeighborhoodCollgCr  1.582e+04  1.082e+04   1.462  0.14405    
## NeighborhoodCrawfor  4.999e+04  1.219e+04   4.102 4.46e-05 ***
## NeighborhoodEdwards  9.523e+03  1.159e+04   0.822  0.41140    
## NeighborhoodGilbert  6.744e+03  1.107e+04   0.609  0.54254    
## NeighborhoodIDOTRR   6.987e+03  1.360e+04   0.514  0.60763    
## NeighborhoodMeadowV -3.260e+03  1.408e+04  -0.232  0.81693    
## NeighborhoodMitchel  9.587e+03  1.154e+04   0.831  0.40639    
## NeighborhoodNAmes    1.539e+04  1.121e+04   1.373  0.17012    
## NeighborhoodNoRidge  4.836e+04  1.233e+04   3.922 9.42e-05 ***
## NeighborhoodNPkVill  7.519e+03  1.315e+04   0.572  0.56775    
## NeighborhoodNridgHt  2.118e+04  1.123e+04   1.885  0.05977 .  
## NeighborhoodNWAmes   1.102e+04  1.149e+04   0.959  0.33788    
## NeighborhoodOldTown  1.001e+04  1.226e+04   0.816  0.41447    
## NeighborhoodSawyer   1.102e+04  1.169e+04   0.943  0.34613    
## NeighborhoodSawyerW  1.350e+04  1.104e+04   1.224  0.22142    
## NeighborhoodSomerst  1.333e+04  1.097e+04   1.215  0.22452    
## NeighborhoodStoneBr  5.216e+04  1.319e+04   3.955 8.23e-05 ***
## NeighborhoodSWISU    1.269e+04  1.324e+04   0.958  0.33827    
## NeighborhoodTimber   1.701e+04  1.224e+04   1.390  0.16495    
## NeighborhoodVeenker  3.652e+04  1.791e+04   2.039  0.04176 *  
## ExterQualFa         -3.475e+04  1.147e+04  -3.030  0.00251 ** 
## ExterQualGd         -3.252e+04  7.601e+03  -4.279 2.07e-05 ***
## ExterQualTA         -3.656e+04  8.230e+03  -4.442 9.97e-06 ***
## X1stFlrSF           -1.233e+01  4.879e+00  -2.526  0.01169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28230 on 930 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.8691, Adjusted R-squared:  0.8621 
## F-statistic: 123.5 on 50 and 930 DF,  p-value: < 2.2e-16
library(ggfortify)
autoplot(regAfter2009_adjusted)
## Warning: Removed 273 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

# In the diagnostic plots for regAfter2009_adjusted, a few points lie far from the main trend line in the Residuals vs Fitted and QQ plots. These represent potential outliers with sale prices that differ greatly from model predictions, possibly due to unusual property characteristics or remaining fraudulent reporting.