# Reading in the data and taking a glimpse of data before manipulation. Summary shows categories with heavy NAs
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
before2009 = read_csv("PricesBefore2009.csv")
## New names:
## Rows: 1933 Columns: 82
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf... dbl
## (39): ...1, Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCo...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
str("PricesBefore2009")
## chr "PricesBefore2009"
head(before2009)
## # A tibble: 6 × 82
## ...1 Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <chr> <chr> <chr>
## 1 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 7 20 RL 75 10084 Pave <NA> Reg
## # ℹ 73 more variables: LandContour <chr>, Utilities <chr>, LotConfig <chr>,
## # LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>, Condition2 <chr>,
## # BldgType <chr>, HouseStyle <chr>, OverallQual <dbl>, OverallCond <dbl>,
## # YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## # Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>, MasVnrArea <dbl>,
## # ExterQual <chr>, ExterCond <chr>, Foundation <chr>, BsmtQual <chr>,
## # BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, …
summary(before2009)
## ...1 Id MSSubClass MSZoning
## Min. : 1 Min. : 1 Min. : 20.00 Length:1933
## 1st Qu.: 484 1st Qu.: 733 1st Qu.: 20.00 Class :character
## Median : 967 Median :1953 Median : 50.00 Mode :character
## Mean : 967 Mean :1594 Mean : 57.81
## 3rd Qu.:1450 3rd Qu.:2436 3rd Qu.: 70.00
## Max. :1933 Max. :2919 Max. :190.00
##
## LotFrontage LotArea Street Alley
## Min. : 21.00 Min. : 1470 Length:1933 Length:1933
## 1st Qu.: 58.00 1st Qu.: 7446 Class :character Class :character
## Median : 68.00 Median : 9462 Mode :character Mode :character
## Mean : 69.38 Mean : 10306
## 3rd Qu.: 80.00 3rd Qu.: 11643
## Max. :313.00 Max. :164660
## NA's :317
## LotShape LandContour Utilities LotConfig
## Length:1933 Length:1933 Length:1933 Length:1933
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## LandSlope Neighborhood Condition1 Condition2
## Length:1933 Length:1933 Length:1933 Length:1933
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BldgType HouseStyle OverallQual OverallCond
## Length:1933 Length:1933 Min. : 1.000 Min. :1.00
## Class :character Class :character 1st Qu.: 5.000 1st Qu.:5.00
## Mode :character Mode :character Median : 6.000 Median :5.00
## Mean : 6.108 Mean :5.57
## 3rd Qu.: 7.000 3rd Qu.:6.00
## Max. :10.000 Max. :9.00
##
## YearBuilt YearRemodAdd RoofStyle RoofMatl
## Min. :1872 Min. :1950 Length:1933 Length:1933
## 1st Qu.:1953 1st Qu.:1965 Class :character Class :character
## Median :1972 Median :1993 Mode :character Mode :character
## Mean :1971 Mean :1984
## 3rd Qu.:2001 3rd Qu.:2004
## Max. :2008 Max. :2009
##
## Exterior1st Exterior2nd MasVnrType MasVnrArea
## Length:1933 Length:1933 Length:1933 Min. : 0.0
## Class :character Class :character Class :character 1st Qu.: 0.0
## Mode :character Mode :character Mode :character Median : 0.0
## Mean : 105.2
## 3rd Qu.: 168.0
## Max. :1600.0
## NA's :18
## ExterQual ExterCond Foundation BsmtQual
## Length:1933 Length:1933 Length:1933 Length:1933
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## Length:1933 Length:1933 Length:1933 Min. : 0.0
## Class :character Class :character Class :character 1st Qu.: 0.0
## Mode :character Mode :character Mode :character Median : 360.5
## Mean : 435.9
## 3rd Qu.: 733.0
## Max. :5644.0
## NA's :1
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## Length:1933 Min. : 0.00 Min. : 0.0 Min. : 0.0
## Class :character 1st Qu.: 0.00 1st Qu.: 226.0 1st Qu.: 796.8
## Mode :character Median : 0.00 Median : 476.5 Median : 988.0
## Mean : 48.96 Mean : 569.1 Mean :1053.9
## 3rd Qu.: 0.00 3rd Qu.: 816.0 3rd Qu.:1296.0
## Max. :1474.00 Max. :2153.0 Max. :6110.0
## NA's :1 NA's :1 NA's :1
## Heating HeatingQC CentralAir Electrical
## Length:1933 Length:1933 Length:1933 Length:1933
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 334 Min. : 0 Min. : 0.0 Min. : 334 Min. :0.0000
## 1st Qu.: 886 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:1118 1st Qu.:0.0000
## Median :1084 Median : 0 Median : 0.0 Median :1440 Median :0.0000
## Mean :1161 Mean : 342 Mean : 4.3 Mean :1507 Mean :0.4138
## 3rd Qu.:1383 3rd Qu.: 720 3rd Qu.: 0.0 3rd Qu.:1760 3rd Qu.:1.0000
## Max. :5095 Max. :2065 Max. :697.0 Max. :5642 Max. :2.0000
## NA's :2
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.06525 Mean :1.568 Mean :0.3787 Mean :2.872
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
## NA's :2
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1933 Min. : 2.000 Length:1933
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.039 Mean : 6.474
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :2.000 Max. :15.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.0000 Length:1933 Length:1933 Min. :1895
## 1st Qu.:0.0000 Class :character Class :character 1st Qu.:1960
## Median :1.0000 Mode :character Mode :character Median :1979
## Mean :0.5934 Mean :1978
## 3rd Qu.:1.0000 3rd Qu.:2002
## Max. :4.0000 Max. :2207
## NA's :107
## GarageFinish GarageCars GarageArea GarageQual
## Length:1933 Min. :0.00 Min. : 0.0 Length:1933
## Class :character 1st Qu.:1.00 1st Qu.: 318.8 Class :character
## Mode :character Median :2.00 Median : 478.0 Mode :character
## Mean :1.77 Mean : 472.9
## 3rd Qu.:2.00 3rd Qu.: 576.0
## Max. :4.00 Max. :1488.0
## NA's :1 NA's :1
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1933 Length:1933 Min. : 0.0 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.0 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.0 Median : 28.00
## Mean : 92.5 Mean : 49.12
## 3rd Qu.: 168.0 3rd Qu.: 72.00
## Max. :1424.0 Max. :742.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.0
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 0.0
## Mean : 23.04 Mean : 2.259 Mean : 16.32 Mean : 3.4
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.0
## Max. :1012.00 Max. :407.000 Max. :576.00 Max. :800.0
##
## PoolQC Fence MiscFeature MiscVal
## Length:1933 Length:1933 Length:1933 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 52.77
## 3rd Qu.: 0.00
## Max. :17000.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1933 Length:1933
## 1st Qu.: 5.000 1st Qu.:2006 Class :character Class :character
## Median : 6.000 Median :2007 Mode :character Mode :character
## Mean : 6.426 Mean :2007
## 3rd Qu.: 8.000 3rd Qu.:2008
## Max. :12.000 Max. :2008
##
## SalePrice
## Min. : 454.5
## 1st Qu.:129000.0
## Median :163611.2
## Mean :181127.7
## 3rd Qu.:214993.0
## Max. :755000.0
## NA's :23
# Convert MSSubClass, OverallQual, and OverallCond to factors since they are currently categories and not numerical order
before2009$MSSubClass = as.factor(before2009$MSSubClass)
before2009$OverallQual = as.factor(before2009$OverallQual)
before2009$OverallCond = as.factor(before2009$OverallCond)
# OverallQual and OverallCond are ordinal (rating scales)
before2009$OverallQual <- ordered(before2009$OverallQual)
before2009$OverallCond <- ordered(before2009$OverallCond)
# Check structure to verify columns are now factors
str(before2009)
## spc_tbl_ [1,933 × 82] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ...1 : num [1:1933] 1 2 3 4 5 6 7 8 9 10 ...
## $ Id : num [1:1933] 1 2 3 4 5 7 9 10 11 12 ...
## $ MSSubClass : Factor w/ 16 levels "20","30","40",..: 6 1 6 7 6 1 5 16 1 6 ...
## $ MSZoning : chr [1:1933] "RL" "RL" "RL" "RL" ...
## $ LotFrontage : num [1:1933] 65 80 68 60 84 75 51 50 70 85 ...
## $ LotArea : num [1:1933] 8450 9600 11250 9550 14260 ...
## $ Street : chr [1:1933] "Pave" "Pave" "Pave" "Pave" ...
## $ Alley : chr [1:1933] NA NA NA NA ...
## $ LotShape : chr [1:1933] "Reg" "Reg" "IR1" "IR1" ...
## $ LandContour : chr [1:1933] "Lvl" "Lvl" "Lvl" "Lvl" ...
## $ Utilities : chr [1:1933] "AllPub" "AllPub" "AllPub" "AllPub" ...
## $ LotConfig : chr [1:1933] "Inside" "FR2" "Inside" "Corner" ...
## $ LandSlope : chr [1:1933] "Gtl" "Gtl" "Gtl" "Gtl" ...
## $ Neighborhood : chr [1:1933] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
## $ Condition1 : chr [1:1933] "Norm" "Feedr" "Norm" "Norm" ...
## $ Condition2 : chr [1:1933] "Norm" "Norm" "Norm" "Norm" ...
## $ BldgType : chr [1:1933] "1Fam" "1Fam" "1Fam" "1Fam" ...
## $ HouseStyle : chr [1:1933] "2Story" "1Story" "2Story" "2Story" ...
## $ OverallQual : Ord.factor w/ 10 levels "1"<"2"<"3"<"4"<..: 7 6 7 7 8 8 7 5 5 9 ...
## $ OverallCond : Ord.factor w/ 9 levels "1"<"2"<"3"<"4"<..: 5 8 5 5 5 5 5 6 5 5 ...
## $ YearBuilt : num [1:1933] 2003 1976 2001 1915 2000 ...
## $ YearRemodAdd : num [1:1933] 2003 1976 2002 1970 2000 ...
## $ RoofStyle : chr [1:1933] "Gable" "Gable" "Gable" "Gable" ...
## $ RoofMatl : chr [1:1933] "CompShg" "CompShg" "CompShg" "CompShg" ...
## $ Exterior1st : chr [1:1933] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
## $ Exterior2nd : chr [1:1933] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
## $ MasVnrType : chr [1:1933] "BrkFace" "None" "BrkFace" "None" ...
## $ MasVnrArea : num [1:1933] 196 0 162 0 350 186 0 0 0 286 ...
## $ ExterQual : chr [1:1933] "Gd" "TA" "Gd" "TA" ...
## $ ExterCond : chr [1:1933] "TA" "TA" "TA" "TA" ...
## $ Foundation : chr [1:1933] "PConc" "CBlock" "PConc" "BrkTil" ...
## $ BsmtQual : chr [1:1933] "Gd" "Gd" "Gd" "TA" ...
## $ BsmtCond : chr [1:1933] "TA" "TA" "TA" "Gd" ...
## $ BsmtExposure : chr [1:1933] "No" "Gd" "Mn" "No" ...
## $ BsmtFinType1 : chr [1:1933] "GLQ" "ALQ" "GLQ" "ALQ" ...
## $ BsmtFinSF1 : num [1:1933] 706 978 486 216 655 ...
## $ BsmtFinType2 : chr [1:1933] "Unf" "Unf" "Unf" "Unf" ...
## $ BsmtFinSF2 : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
## $ BsmtUnfSF : num [1:1933] 150 284 434 540 490 317 952 140 134 177 ...
## $ TotalBsmtSF : num [1:1933] 856 1262 920 756 1145 ...
## $ Heating : chr [1:1933] "GasA" "GasA" "GasA" "GasA" ...
## $ HeatingQC : chr [1:1933] "Ex" "Ex" "Ex" "Gd" ...
## $ CentralAir : chr [1:1933] "Y" "Y" "Y" "Y" ...
## $ Electrical : chr [1:1933] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
## $ X1stFlrSF : num [1:1933] 856 1262 920 961 1145 ...
## $ X2ndFlrSF : num [1:1933] 854 0 866 756 1053 ...
## $ LowQualFinSF : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
## $ GrLivArea : num [1:1933] 1710 1262 1786 1717 2198 ...
## $ BsmtFullBath : num [1:1933] 1 0 1 1 1 1 0 1 1 1 ...
## $ BsmtHalfBath : num [1:1933] 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : num [1:1933] 2 2 2 1 2 2 2 1 1 3 ...
## $ HalfBath : num [1:1933] 1 0 1 0 1 0 0 0 0 0 ...
## $ BedroomAbvGr : num [1:1933] 3 3 3 3 4 3 2 2 3 4 ...
## $ KitchenAbvGr : num [1:1933] 1 1 1 1 1 1 2 2 1 1 ...
## $ KitchenQual : chr [1:1933] "Gd" "TA" "Gd" "Gd" ...
## $ TotRmsAbvGrd : num [1:1933] 8 6 6 7 9 7 8 5 5 11 ...
## $ Functional : chr [1:1933] "Typ" "Typ" "Typ" "Typ" ...
## $ Fireplaces : num [1:1933] 0 1 1 1 1 1 2 2 0 2 ...
## $ FireplaceQu : chr [1:1933] NA "TA" "TA" "Gd" ...
## $ GarageType : chr [1:1933] "Attchd" "Attchd" "Attchd" "Detchd" ...
## $ GarageYrBlt : num [1:1933] 2003 1976 2001 1998 2000 ...
## $ GarageFinish : chr [1:1933] "RFn" "RFn" "RFn" "Unf" ...
## $ GarageCars : num [1:1933] 2 2 2 3 3 2 2 1 1 3 ...
## $ GarageArea : num [1:1933] 548 460 608 642 836 636 468 205 384 736 ...
## $ GarageQual : chr [1:1933] "TA" "TA" "TA" "TA" ...
## $ GarageCond : chr [1:1933] "TA" "TA" "TA" "TA" ...
## $ PavedDrive : chr [1:1933] "Y" "Y" "Y" "Y" ...
## $ WoodDeckSF : num [1:1933] 0 298 0 0 192 255 90 0 0 147 ...
## $ OpenPorchSF : num [1:1933] 61 0 42 35 84 57 0 4 0 21 ...
## $ EnclosedPorch: num [1:1933] 0 0 0 272 0 0 205 0 0 0 ...
## $ X3SsnPorch : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
## $ ScreenPorch : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolArea : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
## $ PoolQC : chr [1:1933] NA NA NA NA ...
## $ Fence : chr [1:1933] NA NA NA NA ...
## $ MiscFeature : chr [1:1933] NA NA NA NA ...
## $ MiscVal : num [1:1933] 0 0 0 0 0 0 0 0 0 0 ...
## $ MoSold : num [1:1933] 2 5 9 2 12 8 4 1 2 7 ...
## $ YrSold : num [1:1933] 2008 2007 2008 2006 2008 ...
## $ SaleType : chr [1:1933] "WD" "WD" "WD" "WD" ...
## $ SaleCondition: chr [1:1933] "Normal" "Normal" "Normal" "Abnorml" ...
## $ SalePrice : num [1:1933] 208500 181500 223500 140000 250000 ...
## - attr(*, "spec")=
## .. cols(
## .. ...1 = col_double(),
## .. Id = col_double(),
## .. MSSubClass = col_double(),
## .. MSZoning = col_character(),
## .. LotFrontage = col_double(),
## .. LotArea = col_double(),
## .. Street = col_character(),
## .. Alley = col_character(),
## .. LotShape = col_character(),
## .. LandContour = col_character(),
## .. Utilities = col_character(),
## .. LotConfig = col_character(),
## .. LandSlope = col_character(),
## .. Neighborhood = col_character(),
## .. Condition1 = col_character(),
## .. Condition2 = col_character(),
## .. BldgType = col_character(),
## .. HouseStyle = col_character(),
## .. OverallQual = col_double(),
## .. OverallCond = col_double(),
## .. YearBuilt = col_double(),
## .. YearRemodAdd = col_double(),
## .. RoofStyle = col_character(),
## .. RoofMatl = col_character(),
## .. Exterior1st = col_character(),
## .. Exterior2nd = col_character(),
## .. MasVnrType = col_character(),
## .. MasVnrArea = col_double(),
## .. ExterQual = col_character(),
## .. ExterCond = col_character(),
## .. Foundation = col_character(),
## .. BsmtQual = col_character(),
## .. BsmtCond = col_character(),
## .. BsmtExposure = col_character(),
## .. BsmtFinType1 = col_character(),
## .. BsmtFinSF1 = col_double(),
## .. BsmtFinType2 = col_character(),
## .. BsmtFinSF2 = col_double(),
## .. BsmtUnfSF = col_double(),
## .. TotalBsmtSF = col_double(),
## .. Heating = col_character(),
## .. HeatingQC = col_character(),
## .. CentralAir = col_character(),
## .. Electrical = col_character(),
## .. X1stFlrSF = col_double(),
## .. X2ndFlrSF = col_double(),
## .. LowQualFinSF = col_double(),
## .. GrLivArea = col_double(),
## .. BsmtFullBath = col_double(),
## .. BsmtHalfBath = col_double(),
## .. FullBath = col_double(),
## .. HalfBath = col_double(),
## .. BedroomAbvGr = col_double(),
## .. KitchenAbvGr = col_double(),
## .. KitchenQual = col_character(),
## .. TotRmsAbvGrd = col_double(),
## .. Functional = col_character(),
## .. Fireplaces = col_double(),
## .. FireplaceQu = col_character(),
## .. GarageType = col_character(),
## .. GarageYrBlt = col_double(),
## .. GarageFinish = col_character(),
## .. GarageCars = col_double(),
## .. GarageArea = col_double(),
## .. GarageQual = col_character(),
## .. GarageCond = col_character(),
## .. PavedDrive = col_character(),
## .. WoodDeckSF = col_double(),
## .. OpenPorchSF = col_double(),
## .. EnclosedPorch = col_double(),
## .. X3SsnPorch = col_double(),
## .. ScreenPorch = col_double(),
## .. PoolArea = col_double(),
## .. PoolQC = col_character(),
## .. Fence = col_character(),
## .. MiscFeature = col_character(),
## .. MiscVal = col_double(),
## .. MoSold = col_double(),
## .. YrSold = col_double(),
## .. SaleType = col_character(),
## .. SaleCondition = col_character(),
## .. SalePrice = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Compute number of NAs in each column
na_counts = colSums(is.na(before2009))
# Get column names
col_names = colnames(before2009)
# Combine two vectors side by side and NA counts into a dataframe called beforeNAs
beforeNAs = data.frame(Data_Fields = col_names, NAs = na_counts)
# View 10 rows
head(beforeNAs, 10)
## Data_Fields NAs
## ...1 ...1 0
## Id Id 0
## MSSubClass MSSubClass 0
## MSZoning MSZoning 3
## LotFrontage LotFrontage 317
## LotArea LotArea 0
## Street Street 0
## Alley Alley 1797
## LotShape LotShape 0
## LandContour LandContour 0
# Sort the NA counts in descending order to see which columns have the most missing data
beforeNAs = beforeNAs[order(-beforeNAs$NAs), ]
# Display the top 10 columns with the most missing values
head(beforeNAs, 10)
## Data_Fields NAs
## PoolQC PoolQC 1923
## MiscFeature MiscFeature 1871
## Alley Alley 1797
## Fence Fence 1569
## FireplaceQu FireplaceQu 938
## LotFrontage LotFrontage 317
## GarageYrBlt GarageYrBlt 107
## GarageFinish GarageFinish 107
## GarageQual GarageQual 107
## GarageCond GarageCond 107
# Drop columns (except SalePrice) that have more than 100 missing values
before2009 = before2009[, colSums(is.na(before2009)) <= 100 | colnames(before2009) == "SalePrice"]
# Drop Id and Utilities columns
before2009 = subset(before2009, select = -c(Id, Utilities))
# Print first 10 rows to verify
head(before2009, 10)
## # A tibble: 10 × 69
## ...1 MSSubClass MSZoning LotArea Street LotShape LandContour LotConfig
## <dbl> <fct> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 60 RL 8450 Pave Reg Lvl Inside
## 2 2 20 RL 9600 Pave Reg Lvl FR2
## 3 3 60 RL 11250 Pave IR1 Lvl Inside
## 4 4 70 RL 9550 Pave IR1 Lvl Corner
## 5 5 60 RL 14260 Pave IR1 Lvl FR2
## 6 6 20 RL 10084 Pave Reg Lvl Inside
## 7 7 50 RM 6120 Pave Reg Lvl Inside
## 8 8 190 RL 7420 Pave Reg Lvl Corner
## 9 9 20 RL 11200 Pave Reg Lvl Inside
## 10 10 60 RL 11924 Pave IR1 Lvl Inside
## # ℹ 61 more variables: LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>,
## # Condition2 <chr>, BldgType <chr>, HouseStyle <chr>, OverallQual <ord>,
## # OverallCond <ord>, YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>,
## # RoofMatl <chr>, Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>,
## # MasVnrArea <dbl>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## # BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>,
## # BsmtFinSF1 <dbl>, BsmtFinType2 <chr>, BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>, …
# Conduct multiple linear regression with SalePrice as the response
regBefore2009 = lm(SalePrice ~ ., data = before2009)
# Print summary to verify results
summary(regBefore2009)
##
## Call:
## lm(formula = SalePrice ~ ., data = before2009)
##
## Residuals:
## Min 1Q Median 3Q Max
## -178538 -4758 -10 4645 159265
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.979e+06 1.036e+06 -1.911 0.056228 .
## ...1 -2.001e-01 7.139e-01 -0.280 0.779316
## MSSubClass30 -4.076e+02 2.885e+03 -0.141 0.887651
## MSSubClass40 4.599e+01 8.916e+03 0.005 0.995885
## MSSubClass45 6.516e+03 1.211e+04 0.538 0.590536
## MSSubClass50 3.835e+03 5.220e+03 0.735 0.462620
## MSSubClass60 2.177e+03 5.185e+03 0.420 0.674623
## MSSubClass70 4.484e+03 5.356e+03 0.837 0.402595
## MSSubClass75 1.954e+03 7.860e+03 0.249 0.803755
## MSSubClass80 -1.103e+04 7.484e+03 -1.474 0.140670
## MSSubClass85 -8.479e+03 5.963e+03 -1.422 0.155256
## MSSubClass90 -1.286e+04 4.923e+03 -2.612 0.009077 **
## MSSubClass120 -9.343e+03 7.154e+03 -1.306 0.191759
## MSSubClass150 3.250e+03 1.938e+04 0.168 0.866816
## MSSubClass160 -4.939e+03 9.193e+03 -0.537 0.591129
## MSSubClass180 -8.511e+03 1.003e+04 -0.848 0.396372
## MSSubClass190 -6.635e+03 1.293e+04 -0.513 0.608027
## MSZoningFV 4.122e+04 7.309e+03 5.640 2.01e-08 ***
## MSZoningRH 2.680e+04 7.555e+03 3.547 0.000401 ***
## MSZoningRL 3.068e+04 6.286e+03 4.880 1.16e-06 ***
## MSZoningRM 3.081e+04 5.906e+03 5.216 2.06e-07 ***
## LotArea 5.397e-01 7.872e-02 6.856 1.00e-11 ***
## StreetPave 2.893e+04 6.801e+03 4.253 2.23e-05 ***
## LotShapeIR2 5.045e+03 2.509e+03 2.010 0.044552 *
## LotShapeIR3 7.957e+03 5.077e+03 1.567 0.117219
## LotShapeReg 6.244e+02 9.637e+02 0.648 0.517100
## LandContourHLS 1.128e+04 2.964e+03 3.806 0.000146 ***
## LandContourLow 4.217e+02 4.091e+03 0.103 0.917916
## LandContourLvl 1.029e+04 2.187e+03 4.702 2.79e-06 ***
## LotConfigCulDSac 6.668e+03 1.971e+03 3.382 0.000736 ***
## LotConfigFR2 -8.933e+03 2.640e+03 -3.383 0.000734 ***
## LotConfigFR3 -1.354e+04 5.142e+03 -2.632 0.008562 **
## LotConfigInside -3.401e+03 1.091e+03 -3.117 0.001859 **
## LandSlopeMod 1.105e+04 2.446e+03 4.519 6.65e-06 ***
## LandSlopeSev -1.999e+04 7.348e+03 -2.721 0.006575 **
## NeighborhoodBlueste -1.050e+04 9.841e+03 -1.067 0.286085
## NeighborhoodBrDale -2.398e+03 6.859e+03 -0.350 0.726695
## NeighborhoodBrkSide -9.373e+03 5.554e+03 -1.688 0.091661 .
## NeighborhoodClearCr -2.004e+04 5.877e+03 -3.410 0.000666 ***
## NeighborhoodCollgCr -1.606e+04 4.337e+03 -3.704 0.000219 ***
## NeighborhoodCrawfor 6.379e+03 5.039e+03 1.266 0.205724
## NeighborhoodEdwards -2.396e+04 4.793e+03 -5.000 6.35e-07 ***
## NeighborhoodGilbert -1.691e+04 4.613e+03 -3.665 0.000255 ***
## NeighborhoodIDOTRR -1.664e+04 6.043e+03 -2.753 0.005963 **
## NeighborhoodMeadowV -2.073e+04 6.895e+03 -3.007 0.002683 **
## NeighborhoodMitchel -2.794e+04 4.856e+03 -5.753 1.05e-08 ***
## NeighborhoodNAmes -2.077e+04 4.676e+03 -4.441 9.54e-06 ***
## NeighborhoodNoRidge 1.610e+04 5.123e+03 3.143 0.001703 **
## NeighborhoodNPkVill 8.668e+03 1.038e+04 0.835 0.403900
## NeighborhoodNridgHt 1.428e+04 4.516e+03 3.161 0.001601 **
## NeighborhoodNWAmes -2.351e+04 4.817e+03 -4.880 1.17e-06 ***
## NeighborhoodOldTown -2.152e+04 5.558e+03 -3.871 0.000113 ***
## NeighborhoodSawyer -1.595e+04 4.850e+03 -3.289 0.001028 **
## NeighborhoodSawyerW -1.082e+04 4.746e+03 -2.280 0.022746 *
## NeighborhoodSomerst -1.343e+04 5.278e+03 -2.544 0.011049 *
## NeighborhoodStoneBr 2.972e+04 5.140e+03 5.782 8.85e-09 ***
## NeighborhoodSWISU -1.141e+04 5.950e+03 -1.918 0.055330 .
## NeighborhoodTimber -1.174e+04 4.844e+03 -2.424 0.015457 *
## NeighborhoodVeenker -3.820e+03 5.876e+03 -0.650 0.515734
## Condition1Feedr 2.234e+03 3.065e+03 0.729 0.466271
## Condition1Norm 1.271e+04 2.568e+03 4.950 8.20e-07 ***
## Condition1PosA 1.150e+04 5.847e+03 1.967 0.049411 *
## Condition1PosN 7.208e+03 4.654e+03 1.549 0.121596
## Condition1RRAe -1.508e+04 4.834e+03 -3.121 0.001837 **
## Condition1RRAn 1.055e+04 3.974e+03 2.654 0.008022 **
## Condition1RRNe -1.997e+03 8.856e+03 -0.225 0.821646
## Condition1RRNn 7.427e+03 9.379e+03 0.792 0.428546
## Condition2Feedr -1.153e+04 1.047e+04 -1.101 0.270935
## Condition2Norm -4.704e+03 9.203e+03 -0.511 0.609289
## Condition2PosA -6.917e+03 1.457e+04 -0.475 0.635070
## Condition2PosN -2.458e+05 1.375e+04 -17.880 < 2e-16 ***
## Condition2RRAe -1.128e+05 2.415e+04 -4.673 3.21e-06 ***
## Condition2RRAn -1.028e+04 1.880e+04 -0.547 0.584608
## Condition2RRNn -6.878e+02 1.485e+04 -0.046 0.963068
## BldgType2fmCon -4.073e+03 1.295e+04 -0.315 0.753087
## BldgTypeDuplex NA NA NA NA
## BldgTypeTwnhs -1.669e+04 7.715e+03 -2.163 0.030686 *
## BldgTypeTwnhsE -1.623e+04 7.127e+03 -2.277 0.022927 *
## HouseStyle1.5Unf 7.616e+03 1.129e+04 0.675 0.499938
## HouseStyle1Story 1.116e+04 5.275e+03 2.116 0.034466 *
## HouseStyle2.5Fin -1.326e+04 1.044e+04 -1.270 0.204150
## HouseStyle2.5Unf -9.051e+03 7.513e+03 -1.205 0.228491
## HouseStyle2Story -7.049e+03 5.112e+03 -1.379 0.168116
## HouseStyleSFoyer 1.372e+04 6.885e+03 1.993 0.046394 *
## HouseStyleSLvl 1.542e+04 8.062e+03 1.913 0.055967 .
## OverallQual.L 7.995e+04 7.197e+03 11.109 < 2e-16 ***
## OverallQual.Q 3.879e+04 6.133e+03 6.324 3.29e-10 ***
## OverallQual.C 1.851e+04 4.894e+03 3.783 0.000161 ***
## OverallQual^4 7.991e+03 3.931e+03 2.033 0.042227 *
## OverallQual^5 4.060e+03 3.105e+03 1.308 0.191135
## OverallQual^6 1.497e+03 2.273e+03 0.659 0.510296
## OverallQual^7 1.208e+03 1.502e+03 0.804 0.421441
## OverallQual^8 4.453e+01 9.513e+02 0.047 0.962671
## OverallCond.L 5.466e+04 1.154e+04 4.738 2.35e-06 ***
## OverallCond.Q -4.500e+03 8.535e+03 -0.527 0.598101
## OverallCond.C 3.524e+03 8.176e+03 0.431 0.666500
## OverallCond^4 -5.331e+02 1.069e+04 -0.050 0.960235
## OverallCond^5 1.907e+03 1.075e+04 0.177 0.859178
## OverallCond^6 -2.832e+01 8.149e+03 -0.003 0.997227
## OverallCond^7 -1.228e+03 4.731e+03 -0.260 0.795227
## OverallCond^8 8.598e+02 2.167e+03 0.397 0.691597
## YearBuilt 3.354e+02 5.063e+01 6.625 4.72e-11 ***
## YearRemodAdd 1.000e+02 3.359e+01 2.978 0.002949 **
## RoofStyleGable -2.570e+03 8.788e+03 -0.292 0.770018
## RoofStyleGambrel 7.252e+02 9.865e+03 0.074 0.941410
## RoofStyleHip -1.456e+03 8.835e+03 -0.165 0.869113
## RoofStyleMansard 1.046e+04 1.114e+04 0.939 0.347895
## RoofStyleShed 8.737e+04 1.545e+04 5.656 1.82e-08 ***
## RoofMatlCompShg 6.588e+05 2.077e+04 31.715 < 2e-16 ***
## RoofMatlMembran 7.337e+05 2.954e+04 24.839 < 2e-16 ***
## RoofMatlMetal 6.956e+05 2.924e+04 23.790 < 2e-16 ***
## RoofMatlRoll 6.540e+05 2.704e+04 24.185 < 2e-16 ***
## RoofMatlTar&Grv 6.638e+05 2.230e+04 29.765 < 2e-16 ***
## RoofMatlWdShake 6.405e+05 2.212e+04 28.956 < 2e-16 ***
## RoofMatlWdShngl 7.384e+05 2.191e+04 33.695 < 2e-16 ***
## Exterior1stAsphShn -1.658e+04 2.350e+04 -0.706 0.480461
## Exterior1stBrkComm -1.030e+04 1.485e+04 -0.694 0.487916
## Exterior1stBrkFace 7.470e+03 8.884e+03 0.841 0.400546
## Exterior1stCemntBd -1.161e+04 1.261e+04 -0.921 0.357418
## Exterior1stHdBoard -1.121e+04 8.818e+03 -1.271 0.203762
## Exterior1stImStucc -6.537e+04 1.881e+04 -3.476 0.000523 ***
## Exterior1stMetalSd -1.660e+03 9.484e+03 -0.175 0.861119
## Exterior1stPlywood -1.569e+04 8.759e+03 -1.792 0.073375 .
## Exterior1stStone -1.689e+04 2.040e+04 -0.828 0.407636
## Exterior1stStucco -7.786e+03 9.777e+03 -0.796 0.425892
## Exterior1stVinylSd -1.640e+04 9.652e+03 -1.699 0.089498 .
## Exterior1stWd Sdng -9.763e+03 8.612e+03 -1.134 0.257119
## Exterior1stWdShing -5.095e+03 9.094e+03 -0.560 0.575345
## Exterior2ndAsphShn 3.134e+03 1.528e+04 0.205 0.837558
## Exterior2ndBrk Cmn 3.700e+03 1.461e+04 0.253 0.800113
## Exterior2ndBrkFace 2.171e+02 9.677e+03 0.022 0.982108
## Exterior2ndCmentBd 8.848e+03 1.297e+04 0.682 0.495339
## Exterior2ndHdBoard 2.825e+03 9.223e+03 0.306 0.759403
## Exterior2ndImStucc 3.346e+04 1.029e+04 3.250 0.001176 **
## Exterior2ndMetalSd -1.437e+03 9.819e+03 -0.146 0.883684
## Exterior2ndOther -9.966e+03 1.908e+04 -0.522 0.601439
## Exterior2ndPlywood 3.444e+03 9.003e+03 0.383 0.702090
## Exterior2ndStone 9.449e+02 2.643e+04 0.036 0.971485
## Exterior2ndStucco 3.391e+00 1.000e+04 0.000 0.999729
## Exterior2ndVinylSd 1.063e+04 9.968e+03 1.066 0.286414
## Exterior2ndWd Sdng 5.274e+03 9.047e+03 0.583 0.560025
## Exterior2ndWd Shng -3.541e+03 9.434e+03 -0.375 0.707411
## MasVnrTypeBrkFace 8.651e+03 4.308e+03 2.008 0.044764 *
## MasVnrTypeNone 1.170e+04 4.333e+03 2.700 0.006999 **
## MasVnrTypeStone 1.263e+04 4.524e+03 2.792 0.005306 **
## MasVnrArea 1.745e+01 3.429e+00 5.087 4.06e-07 ***
## ExterQualFa 1.738e+04 6.660e+03 2.609 0.009159 **
## ExterQualGd -7.165e+03 3.155e+03 -2.271 0.023267 *
## ExterQualTA -8.687e+03 3.467e+03 -2.505 0.012327 *
## ExterCondFa -1.164e+03 7.480e+03 -0.156 0.876357
## ExterCondGd -8.352e+03 6.786e+03 -1.231 0.218571
## ExterCondTA -5.100e+03 6.766e+03 -0.754 0.451071
## FoundationCBlock 2.161e+03 1.876e+03 1.152 0.249554
## FoundationPConc 6.421e+03 2.010e+03 3.195 0.001427 **
## FoundationStone 3.803e+03 8.048e+03 0.473 0.636598
## FoundationWood -2.113e+04 1.190e+04 -1.775 0.076083 .
## BsmtQualFa -6.816e+03 3.493e+03 -1.951 0.051208 .
## BsmtQualGd -8.203e+03 1.947e+03 -4.212 2.67e-05 ***
## BsmtQualTA -6.553e+03 2.450e+03 -2.675 0.007559 **
## BsmtCondGd -3.628e+02 2.975e+03 -0.122 0.902960
## BsmtCondPo -3.526e+03 1.677e+04 -0.210 0.833552
## BsmtCondTA 2.853e+03 2.357e+03 1.210 0.226316
## BsmtExposureGd 1.760e+03 1.696e+03 1.038 0.299573
## BsmtExposureMn -2.947e+03 1.743e+03 -1.691 0.091072 .
## BsmtExposureNo -3.732e+03 1.315e+03 -2.837 0.004611 **
## BsmtFinType1BLQ -4.615e+02 1.681e+03 -0.275 0.783701
## BsmtFinType1GLQ 1.793e+03 1.494e+03 1.200 0.230358
## BsmtFinType1LwQ -2.067e+03 2.124e+03 -0.973 0.330670
## BsmtFinType1Rec -2.125e+03 1.661e+03 -1.280 0.200876
## BsmtFinType1Unf -6.962e+02 1.704e+03 -0.409 0.682843
## BsmtFinSF1 3.275e+01 2.902e+00 11.285 < 2e-16 ***
## BsmtFinType2BLQ -4.968e+03 4.278e+03 -1.161 0.245695
## BsmtFinType2GLQ -4.276e+03 4.811e+03 -0.889 0.374282
## BsmtFinType2LwQ -6.677e+03 4.074e+03 -1.639 0.101460
## BsmtFinType2Rec -5.203e+03 3.930e+03 -1.324 0.185711
## BsmtFinType2Unf -1.494e+03 4.173e+03 -0.358 0.720389
## BsmtFinSF2 2.707e+01 5.120e+00 5.287 1.41e-07 ***
## BsmtUnfSF 1.373e+01 2.658e+00 5.166 2.69e-07 ***
## TotalBsmtSF NA NA NA NA
## HeatingGasW -8.469e+03 4.493e+03 -1.885 0.059598 .
## HeatingGrav -5.136e+03 8.868e+03 -0.579 0.562550
## HeatingOthW -2.774e+04 1.234e+04 -2.248 0.024710 *
## HeatingQCFa -1.811e+03 2.774e+03 -0.653 0.513827
## HeatingQCGd -3.584e+03 1.202e+03 -2.980 0.002923 **
## HeatingQCPo 1.080e+04 1.261e+04 0.857 0.391825
## HeatingQCTA -3.653e+03 1.216e+03 -3.004 0.002706 **
## CentralAirY 1.928e+02 2.224e+03 0.087 0.930912
## ElectricalFuseF -2.528e+03 3.758e+03 -0.673 0.501197
## ElectricalFuseP -9.087e+03 7.764e+03 -1.170 0.241990
## ElectricalMix 1.082e+04 3.276e+04 0.330 0.741215
## ElectricalSBrkr -1.345e+03 1.796e+03 -0.749 0.454141
## X1stFlrSF 5.305e+01 3.083e+00 17.207 < 2e-16 ***
## X2ndFlrSF 6.563e+01 3.139e+00 20.910 < 2e-16 ***
## LowQualFinSF 1.285e+01 1.117e+01 1.151 0.250101
## GrLivArea NA NA NA NA
## BsmtFullBath 1.677e+03 1.168e+03 1.436 0.151213
## BsmtHalfBath 4.271e+02 1.679e+03 0.254 0.799179
## FullBath 3.899e+03 1.335e+03 2.921 0.003532 **
## HalfBath 1.827e+02 1.279e+03 0.143 0.886482
## BedroomAbvGr -3.131e+03 8.360e+02 -3.745 0.000187 ***
## KitchenAbvGr -7.469e+03 4.300e+03 -1.737 0.082566 .
## KitchenQualFa -1.486e+04 3.748e+03 -3.963 7.72e-05 ***
## KitchenQualGd -1.960e+04 2.175e+03 -9.015 < 2e-16 ***
## KitchenQualTA -1.723e+04 2.413e+03 -7.142 1.38e-12 ***
## TotRmsAbvGrd 6.885e+02 5.737e+02 1.200 0.230333
## FunctionalMaj2 -2.119e+03 1.192e+04 -0.178 0.858921
## FunctionalMin1 3.818e+03 5.803e+03 0.658 0.510726
## FunctionalMin2 5.728e+03 6.052e+03 0.946 0.344092
## FunctionalMod -5.051e+03 6.566e+03 -0.769 0.441784
## FunctionalSev -4.922e+04 1.859e+04 -2.648 0.008168 **
## FunctionalTyp 1.642e+04 5.151e+03 3.187 0.001466 **
## Fireplaces 4.245e+03 8.150e+02 5.209 2.15e-07 ***
## GarageCars 2.358e+03 1.311e+03 1.799 0.072180 .
## GarageArea 1.766e+01 4.447e+00 3.972 7.43e-05 ***
## PavedDriveP -3.387e+03 3.110e+03 -1.089 0.276296
## PavedDriveY -1.302e+03 2.025e+03 -0.643 0.520199
## WoodDeckSF 1.327e+01 3.461e+00 3.836 0.000130 ***
## OpenPorchSF 1.465e+01 6.522e+00 2.246 0.024834 *
## EnclosedPorch 3.246e+00 6.949e+00 0.467 0.640530
## X3SsnPorch 5.795e+01 1.758e+01 3.297 0.000998 ***
## ScreenPorch 2.329e+01 7.240e+00 3.217 0.001321 **
## PoolArea 6.857e+01 1.018e+01 6.736 2.25e-11 ***
## MiscVal -4.092e-01 6.633e-01 -0.617 0.537426
## MoSold -5.282e+02 1.438e+02 -3.672 0.000249 ***
## YrSold 2.363e+02 5.126e+02 0.461 0.644787
## SaleTypeCon 3.746e+04 9.798e+03 3.823 0.000137 ***
## SaleTypeConLD 1.199e+04 5.332e+03 2.249 0.024674 *
## SaleTypeConLI -3.401e+03 1.008e+04 -0.337 0.735798
## SaleTypeConLw -4.058e+02 8.787e+03 -0.046 0.963174
## SaleTypeCWD 2.101e+04 5.381e+03 3.904 9.86e-05 ***
## SaleTypeNew 1.231e+04 8.920e+03 1.380 0.167677
## SaleTypeOth 9.717e+03 9.802e+03 0.991 0.321657
## SaleTypeWD -1.370e+03 2.571e+03 -0.533 0.594137
## SaleConditionAdjLand 9.748e+03 6.362e+03 1.532 0.125645
## SaleConditionAlloca 8.512e+03 6.135e+03 1.387 0.165492
## SaleConditionFamily -2.448e+02 3.260e+03 -0.075 0.940153
## SaleConditionNormal 4.339e+03 1.736e+03 2.499 0.012568 *
## SaleConditionPartial 7.276e+03 8.540e+03 0.852 0.394360
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15630 on 1616 degrees of freedom
## (82 observations deleted due to missingness)
## Multiple R-squared: 0.9662, Adjusted R-squared: 0.9613
## F-statistic: 197.4 on 234 and 1616 DF, p-value: < 2.2e-16
# R-squared = 0.9662: The model explains 96.6% of the variance in house prices before 2009
# Adjusted R-squared = 0.9613
# F-statistic: Highly significant p-value: < 2.2e-16
# Looking at the above output, coefficients with P-values less than .05 and significance flags (*, **, ***) were identified. Physical and quality related variables showed very low p-values.
# Variables grouped by category (e.g. multiple neighborhood variables into one neighborhood category).
regBefore2009optimal = lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
Fireplaces + LotArea + BsmtFinSF1 + KitchenQual +
Neighborhood + ExterQual + X1stFlrSF, data = before2009)
summary(regBefore2009optimal)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
## Fireplaces + LotArea + BsmtFinSF1 + KitchenQual + Neighborhood +
## ExterQual + X1stFlrSF, data = before2009)
##
## Residuals:
## Min 1Q Median 3Q Max
## -500606 -11301 1060 11433 209410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.828e+05 1.262e+05 -6.997 3.64e-12 ***
## OverallQual.L 1.342e+05 1.723e+04 7.793 1.08e-14 ***
## OverallQual.Q 4.380e+04 1.606e+04 2.728 0.006429 **
## OverallQual.C 1.714e+04 1.387e+04 1.236 0.216709
## OverallQual^4 1.015e+04 1.141e+04 0.889 0.374106
## OverallQual^5 -7.806e+03 9.180e+03 -0.850 0.395290
## OverallQual^6 4.627e+03 7.023e+03 0.659 0.510102
## OverallQual^7 7.792e+02 4.986e+03 0.156 0.875825
## OverallQual^8 1.298e+03 3.231e+03 0.402 0.687990
## OverallQual^9 2.951e+02 1.860e+03 0.159 0.873982
## GrLivArea 4.257e+01 2.238e+00 19.021 < 2e-16 ***
## GarageCars 8.710e+03 2.078e+03 4.190 2.91e-05 ***
## GarageArea 8.002e+00 7.148e+00 1.120 0.263057
## TotalBsmtSF 5.077e+00 2.883e+00 1.761 0.078407 .
## FullBath 1.492e+02 1.825e+03 0.082 0.934877
## YearBuilt 2.502e+02 5.062e+01 4.943 8.40e-07 ***
## YearRemodAdd 2.464e+02 4.668e+01 5.278 1.46e-07 ***
## Fireplaces 7.716e+03 1.270e+03 6.078 1.48e-09 ***
## LotArea 4.237e-01 9.938e-02 4.263 2.12e-05 ***
## BsmtFinSF1 1.457e+01 1.788e+00 8.152 6.50e-16 ***
## KitchenQualFa -2.481e+04 6.012e+03 -4.126 3.85e-05 ***
## KitchenQualGd -2.356e+04 3.599e+03 -6.545 7.66e-11 ***
## KitchenQualTA -2.650e+04 3.952e+03 -6.706 2.65e-11 ***
## NeighborhoodBlueste -2.440e+04 1.545e+04 -1.579 0.114407
## NeighborhoodBrDale -2.125e+04 9.298e+03 -2.285 0.022414 *
## NeighborhoodBrkSide 6.000e+03 7.953e+03 0.754 0.450663
## NeighborhoodClearCr 1.345e+04 8.601e+03 1.564 0.117931
## NeighborhoodCollgCr 5.298e+03 6.731e+03 0.787 0.431293
## NeighborhoodCrawfor 2.618e+04 7.662e+03 3.418 0.000645 ***
## NeighborhoodEdwards -1.292e+04 7.347e+03 -1.758 0.078946 .
## NeighborhoodGilbert -4.907e+02 7.055e+03 -0.070 0.944556
## NeighborhoodIDOTRR -8.049e+03 8.110e+03 -0.992 0.321115
## NeighborhoodMeadowV -2.180e+04 9.018e+03 -2.418 0.015711 *
## NeighborhoodMitchel -8.006e+03 7.451e+03 -1.075 0.282725
## NeighborhoodNAmes -1.839e+03 7.067e+03 -0.260 0.794740
## NeighborhoodNoRidge 4.933e+04 7.856e+03 6.280 4.21e-10 ***
## NeighborhoodNPkVill -7.464e+03 1.144e+04 -0.652 0.514223
## NeighborhoodNridgHt 3.338e+04 7.219e+03 4.624 4.02e-06 ***
## NeighborhoodNWAmes -3.209e+03 7.293e+03 -0.440 0.659995
## NeighborhoodOldTown -6.940e+03 7.724e+03 -0.899 0.369023
## NeighborhoodSawyer -5.395e+03 7.391e+03 -0.730 0.465525
## NeighborhoodSawyerW 2.589e+03 7.338e+03 0.353 0.724234
## NeighborhoodSomerst 5.698e+03 6.933e+03 0.822 0.411276
## NeighborhoodStoneBr 3.892e+04 8.103e+03 4.803 1.69e-06 ***
## NeighborhoodSWISU 1.887e+03 9.097e+03 0.207 0.835658
## NeighborhoodTimber 1.110e+04 7.577e+03 1.466 0.142914
## NeighborhoodVeenker 2.919e+04 9.011e+03 3.239 0.001221 **
## ExterQualFa 1.023e+03 9.546e+03 0.107 0.914659
## ExterQualGd -6.376e+03 5.145e+03 -1.239 0.215397
## ExterQualTA -6.487e+03 5.618e+03 -1.155 0.248307
## X1stFlrSF 5.766e+00 3.327e+00 1.733 0.083238 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27640 on 1859 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.8833, Adjusted R-squared: 0.8801
## F-statistic: 281.3 on 50 and 1859 DF, p-value: < 2.2e-16
# Multiple R-squared: 0.8816, Adjusted R-squared: 0.8784, F-statistic: 273.8 on 49 and 1801 DF, p-value: < 2.2e-16.
# The predictors explained approximately 88%.
library(ggplot2)
library(ggfortify)
# Display diagnostic plots for the regression model
autoplot(regBefore2009optimal)
## Warning: `fortify(<lm>)` was deprecated in ggplot2 3.6.0.
## ℹ Please use `broom::augment(<lm>)` instead.
## ℹ The deprecated feature was likely used in the ggfortify package.
## Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggfortify package.
## Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the ggfortify package.
## Please report the issue at <https://github.com/sinhrks/ggfortify/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 409 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# Load the after2009 dataset
after2009 = read_csv("PricesAfter2009.csv")
## New names:
## Rows: 986 Columns: 82
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (42): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf... dbl
## (39): ...1, Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCo... lgl
## (1): PoolQC
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Convert MSSubClass, OverallQual, and OverallCond to factors (same as before2009)
after2009$MSSubClass = as.factor(after2009$MSSubClass)
after2009$OverallQual = as.factor(after2009$OverallQual)
after2009$OverallCond = as.factor(after2009$OverallCond)
# OverallQual and OverallCond are ordinal (rating scales)
after2009$OverallQual <- ordered(after2009$OverallQual)
after2009$OverallCond <- ordered(after2009$OverallCond)
# Drop columns (except SalePrice) that have more than 100 missing values
after2009 = after2009[, colSums(is.na(after2009)) <= 100 | colnames(after2009) == "SalePrice"]
# Drop Id and Utilities columns
after2009 = subset(after2009, select = -c(Id, Utilities))
# Print first 10 rows to verify
head(after2009, 10)
## # A tibble: 10 × 74
## ...1 MSSubClass MSZoning LotArea Street LotShape LandContour LotConfig
## <dbl> <fct> <chr> <dbl> <chr> <chr> <chr> <chr>
## 1 1 50 RL 14115 Pave IR1 Lvl Inside
## 2 2 60 RL 10382 Pave IR1 Lvl Corner
## 3 3 20 RL 11241 Pave IR1 Lvl CulDSac
## 4 4 20 RL 7560 Pave Reg Lvl Inside
## 5 5 20 RL 8246 Pave IR1 Lvl Inside
## 6 6 20 RL 14230 Pave Reg Lvl Corner
## 7 7 20 RL 7200 Pave Reg Lvl Corner
## 8 8 20 RL 11478 Pave Reg Lvl Inside
## 9 9 20 RL 10552 Pave IR1 Lvl Inside
## 10 10 20 RL 10859 Pave Reg Lvl Corner
## # ℹ 66 more variables: LandSlope <chr>, Neighborhood <chr>, Condition1 <chr>,
## # Condition2 <chr>, BldgType <chr>, HouseStyle <chr>, OverallQual <ord>,
## # OverallCond <ord>, YearBuilt <dbl>, YearRemodAdd <dbl>, RoofStyle <chr>,
## # RoofMatl <chr>, Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>,
## # MasVnrArea <dbl>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## # BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>, BsmtFinType1 <chr>,
## # BsmtFinSF1 <dbl>, BsmtFinType2 <chr>, BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>, …
library(ggplot2)
# Create a density plot of SalePrice for all neighborhoods after 2009
# Used free_y to allow each neighborhood to have its own y-axis
# X axis is Sale Price, Y axis shows relative frequency via density
ggplot(after2009, aes(x = SalePrice)) +
geom_density(fill = "red", alpha = 0.5, color = "black") +
facet_wrap(~ Neighborhood, scales = "free_y") +
scale_x_continuous(labels = scales::comma, breaks = seq(0, 500000, 100000)) +
ggtitle("Density Plot of Sale Prices by Neighborhood (After 2009)") +
xlab("Sale Price ($)") +
ylab("Density") +
theme_minimal() +
theme(axis.text.x = element_text(size = 6))
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density()`).

# The majority of neighborhoods display smooth, single-peaked price distributions. Difficult to see any fraud activity at this level.
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
# Density plot for NAmes before 2009
pBefore2009NAmes = ggplot(before2009[before2009$Neighborhood == "NAmes", ], aes(x = SalePrice)) +
geom_density(fill = "yellow", alpha = 0.5) +
ggtitle("Density Plot NAmes Before 2009") +
xlab("Sale Price") +
xlim(0, 800000) +
scale_x_continuous(labels = scales::comma) +
theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Density plot for NAmes after 2009
pAfter2009NAmes = ggplot(after2009[after2009$Neighborhood == "NAmes", ], aes(x = SalePrice)) +
geom_density(fill = "orange", alpha = 0.5) +
ggtitle("Density Plot NAmes After 2009") +
xlab("Sale Price") +
xlim(0, 800000) +
scale_x_continuous(labels = scales::comma) +
theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Density plot for Gilbert before 2009
pBefore2009Gilbert = ggplot(before2009[before2009$Neighborhood == "Gilbert", ], aes(x = SalePrice)) +
geom_density(fill = "lightgreen", alpha = 0.5) +
ggtitle("Density Plot Gilbert Before 2009") +
xlab("Sale Price") +
xlim(0, 800000) +
scale_x_continuous(labels = scales::comma) +
theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Density plot for Gilbert after 2009
pAfter2009Gilbert = ggplot(after2009[after2009$Neighborhood == "Gilbert", ], aes(x = SalePrice)) +
geom_density(fill = "lightblue", alpha = 0.5) +
ggtitle("Density Plot Gilbert After 2009") +
xlab("Sale Price") +
xlim(0, 800000) +
scale_x_continuous(labels = scales::comma) +
theme_minimal()
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
grid.arrange(pBefore2009NAmes, pAfter2009NAmes, pBefore2009Gilbert, pAfter2009Gilbert, nrow = 2)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_density()`).

# NAmes shows no clear signs of pricing anomalies or fraud. Prices remained consistent before and after 2009, suggesting stable market behavior.
# Gilbert's after2009 curve suggests possible irregularities. The sharper peak and multiple bumps indicate sales concentrated at unusual price points, a pattern consistent with fraudulent activity clustering prices around artificial targets.
# Run multiple linear regression on after2009 using the same variables as before2009optimal
regAfter2009optimal = lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
Fireplaces + LotArea + BsmtFinSF1 + KitchenQual +
Neighborhood + ExterQual + X1stFlrSF, data = after2009)
summary(regAfter2009optimal)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
## Fireplaces + LotArea + BsmtFinSF1 + KitchenQual + Neighborhood +
## ExterQual + X1stFlrSF, data = after2009)
##
## Residuals:
## Min 1Q Median 3Q Max
## -270665 -8685 1348 11161 205033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.067e+06 1.877e+05 -5.687 1.73e-08 ***
## OverallQual.L 1.004e+05 1.948e+04 5.155 3.10e-07 ***
## OverallQual.Q 4.323e+04 1.781e+04 2.427 0.01542 *
## OverallQual.C 1.526e+04 1.489e+04 1.025 0.30563
## OverallQual^4 1.088e+04 1.198e+04 0.909 0.36384
## OverallQual^5 1.379e+03 9.267e+03 0.149 0.88176
## OverallQual^6 2.161e+03 7.357e+03 0.294 0.76906
## OverallQual^7 1.263e+03 5.768e+03 0.219 0.82670
## OverallQual^8 -8.692e+02 4.125e+03 -0.211 0.83317
## OverallQual^9 1.147e+03 2.541e+03 0.451 0.65183
## GrLivArea 4.609e+01 3.335e+00 13.822 < 2e-16 ***
## GarageCars 3.140e+03 2.955e+03 1.062 0.28829
## GarageArea 1.906e+01 1.027e+01 1.856 0.06374 .
## TotalBsmtSF 1.857e+01 4.019e+00 4.622 4.34e-06 ***
## FullBath 1.890e+03 2.526e+03 0.748 0.45463
## YearBuilt 3.183e+02 7.503e+01 4.242 2.43e-05 ***
## YearRemodAdd 2.773e+02 6.573e+01 4.219 2.70e-05 ***
## Fireplaces 3.214e+03 1.764e+03 1.822 0.06882 .
## LotArea 8.393e-01 1.295e-01 6.482 1.47e-10 ***
## BsmtFinSF1 2.552e+01 2.536e+00 10.063 < 2e-16 ***
## KitchenQualFa -2.892e+04 8.816e+03 -3.281 0.00107 **
## KitchenQualGd -2.508e+04 5.746e+03 -4.364 1.42e-05 ***
## KitchenQualTA -2.845e+04 6.210e+03 -4.582 5.23e-06 ***
## NeighborhoodBlueste -2.578e+03 1.577e+04 -0.163 0.87020
## NeighborhoodBrDale 1.149e+04 1.456e+04 0.789 0.43031
## NeighborhoodBrkSide 2.711e+04 1.248e+04 2.172 0.03011 *
## NeighborhoodClearCr 2.756e+04 1.345e+04 2.048 0.04080 *
## NeighborhoodCollgCr 1.582e+04 1.082e+04 1.462 0.14405
## NeighborhoodCrawfor 4.999e+04 1.219e+04 4.102 4.46e-05 ***
## NeighborhoodEdwards 9.523e+03 1.159e+04 0.822 0.41140
## NeighborhoodGilbert 6.744e+03 1.107e+04 0.609 0.54254
## NeighborhoodIDOTRR 6.987e+03 1.360e+04 0.514 0.60763
## NeighborhoodMeadowV -3.260e+03 1.408e+04 -0.232 0.81693
## NeighborhoodMitchel 9.587e+03 1.154e+04 0.831 0.40639
## NeighborhoodNAmes 1.539e+04 1.121e+04 1.373 0.17012
## NeighborhoodNoRidge 4.836e+04 1.233e+04 3.922 9.42e-05 ***
## NeighborhoodNPkVill 7.519e+03 1.315e+04 0.572 0.56775
## NeighborhoodNridgHt 2.118e+04 1.123e+04 1.885 0.05977 .
## NeighborhoodNWAmes 1.102e+04 1.149e+04 0.959 0.33788
## NeighborhoodOldTown 1.001e+04 1.226e+04 0.816 0.41447
## NeighborhoodSawyer 1.102e+04 1.169e+04 0.943 0.34613
## NeighborhoodSawyerW 1.350e+04 1.104e+04 1.224 0.22142
## NeighborhoodSomerst 1.333e+04 1.097e+04 1.215 0.22452
## NeighborhoodStoneBr 5.216e+04 1.319e+04 3.955 8.23e-05 ***
## NeighborhoodSWISU 1.269e+04 1.324e+04 0.958 0.33827
## NeighborhoodTimber 1.701e+04 1.224e+04 1.390 0.16495
## NeighborhoodVeenker 3.652e+04 1.791e+04 2.039 0.04176 *
## ExterQualFa -3.475e+04 1.147e+04 -3.030 0.00251 **
## ExterQualGd -3.252e+04 7.601e+03 -4.279 2.07e-05 ***
## ExterQualTA -3.656e+04 8.230e+03 -4.442 9.97e-06 ***
## X1stFlrSF -1.233e+01 4.879e+00 -2.526 0.01169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28230 on 930 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8691, Adjusted R-squared: 0.8621
## F-statistic: 123.5 on 50 and 930 DF, p-value: < 2.2e-16
library(ggfortify)
autoplot(regAfter2009optimal)
## Warning: Removed 263 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

# The Residuals vs Fitted graph and Scale-Location graph show a clear deviation indicating possible fraud.
ggplot(data = regAfter2009optimal, aes(x = .fitted, y = .resid)) +
geom_point(color = "blue", alpha = 0.6) +
stat_smooth(method = "loess", color = "red", se = FALSE) +
ggtitle("Residuals vs Fitted Values (After 2009)") +
xlab("Fitted Values") +
ylab("Residuals")
## `geom_smooth()` using formula = 'y ~ x'

# The Residuals vs Fitted plot shows a line of outliers deviating towards the bottom right from the main cluster. These points represent potential outliers with unusually high or low residuals, indicating properties with sale prices much higher or lower than expected, suggesting possible data errors or fraudulent activity.
# Method: Use the before2009 model to generate realistic sale prices for flagged rows.
# Step 1: Identify rows in after2009 where old fraud occurred (SalePrice == 142769.7).
# Step 2: Use regBefore2009optimal to predict what those houses should have sold for based on their features.
# Step 3: Add small random variation (about +/-1%) to make prices look natural.
# Step 4: Replace only the flagged SalePrice values with the new model-based values.
# Rationale: Using the model ensures each replaced value makes sense for its specific house. Adding random noise simulates natural market fluctuations so values do not appear computer-generated.
# Density plots for NAmes and Gilbert before fraud fix.
# The red dashed line marks the artificially repeated price of $142,769.70.
# In Gilbert, the clustering produces a visible bump near the red line, suggesting tampering.
pFraudNAmes <- ggplot(after2009 %>% filter(Neighborhood == "NAmes"),
aes(x = SalePrice)) +
geom_density(fill = "yellow", alpha = 0.5, color = "black") +
geom_vline(xintercept = 142769.7, color = "red", linetype = "dashed", size = 1) +
ggtitle("NAmes - Before Fraud Fix (Showing Fraud Spike)") +
xlab("Sale Price") +
ylab("Density") +
scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
theme_minimal()
pFraudGilbert <- ggplot(after2009 %>% filter(Neighborhood == "Gilbert"),
aes(x = SalePrice)) +
geom_density(fill = "lightgreen", alpha = 0.5, color = "black") +
geom_vline(xintercept = 142769.7, color = "red", linetype = "dashed", size = 1) +
ggtitle("Gilbert - Before Fraud Fix (Showing Fraud Spike)") +
xlab("Sale Price") +
ylab("Density") +
scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
theme_minimal()
grid.arrange(pFraudNAmes, pFraudGilbert, nrow = 1)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

library(dplyr)
library(ggplot2)
library(gridExtra)
library(ggfortify)
# Find which rows have the suspicious SalePrice
flagged_rows_before2009 = which(before2009$SalePrice == 142769.7)
flagged_rows_after2009 = which(after2009$SalePrice == 142769.7)
flagged_rows_before2009
## integer(0)
flagged_rows_after2009
## [1] 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532
## [20] 533 534 535 536 537 538 539 540 541 542 543
after2009[flagged_rows_after2009[1:30], c("Neighborhood", "SalePrice")]
## # A tibble: 30 × 2
## Neighborhood SalePrice
## <chr> <dbl>
## 1 NAmes 142770.
## 2 NAmes 142770.
## 3 Gilbert 142770.
## 4 Gilbert 142770.
## 5 StoneBr 142770.
## 6 Gilbert 142770.
## 7 Gilbert 142770.
## 8 Gilbert 142770.
## 9 Gilbert 142770.
## 10 NAmes 142770.
## # ℹ 20 more rows
# Use regBefore2009optimal to predict realistic prices for the flagged rows
predicted_prices = predict(regBefore2009optimal,
newdata = after2009[flagged_rows_after2009, ])
data.frame(Row = 1:length(predicted_prices), Predicted_Price = predicted_prices)
## Row Predicted_Price
## 1 1 113982.88
## 2 2 149850.86
## 3 3 185304.98
## 4 4 189740.28
## 5 5 237263.80
## 6 6 175960.88
## 7 7 168389.62
## 8 8 169338.74
## 9 9 191716.01
## 10 10 121195.49
## 11 11 199681.93
## 12 12 93329.86
## 13 13 93989.93
## 14 14 159492.99
## 15 15 116950.86
## 16 16 364909.75
## 17 17 265628.27
## 18 18 331055.77
## 19 19 309175.17
## 20 20 419873.82
## 21 21 308994.86
## 22 22 248445.75
## 23 23 172523.66
## 24 24 169879.98
## 25 25 188182.62
## 26 26 199920.35
## 27 27 326907.18
## 28 28 244842.21
## 29 29 185438.69
## 30 30 212098.56
summary(predicted_prices)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 93330 168627 188961 210469 247545 419874
fraud_neighborhoods = after2009[flagged_rows_after2009, c("Neighborhood", "GrLivArea", "OverallQual", "SalePrice")]
head(fraud_neighborhoods, 10)
## # A tibble: 10 × 4
## Neighborhood GrLivArea OverallQual SalePrice
## <chr> <dbl> <ord> <dbl>
## 1 NAmes 896 5 142770.
## 2 NAmes 1329 6 142770.
## 3 Gilbert 1629 5 142770.
## 4 Gilbert 1604 6 142770.
## 5 StoneBr 1280 8 142770.
## 6 Gilbert 1655 6 142770.
## 7 Gilbert 1187 6 142770.
## 8 Gilbert 1465 6 142770.
## 9 Gilbert 1341 7 142770.
## 10 NAmes 882 4 142770.
table(fraud_neighborhoods$Neighborhood)
##
## Blmngtn BrDale Gilbert NAmes NoRidge NPkVill NridgHt Somerst StoneBr
## 1 2 9 4 1 2 7 3 1
after2009_adjusted = after2009
# Set seed for reproducibility. Add small random noise (~1% SD) around predicted prices to simulate natural market variation.
set.seed(123)
noise = rnorm(length(predicted_prices),
mean = 0,
sd = predicted_prices * 0.01)
new_prices = predicted_prices + noise
after2009_adjusted = after2009
after2009_adjusted$SalePrice[flagged_rows_after2009] = new_prices
# Compare density plots before and after price adjustment
pBeforeNAmes <- ggplot(after2009 %>% filter(Neighborhood == "NAmes"),
aes(x = SalePrice)) +
geom_density(fill = "yellow", alpha = 0.5, color = "black") +
geom_vline(xintercept = 142769.7, color = "red", linetype = "dashed", size = 1) +
ggtitle("NAmes - Before Fraud Fix (Spike @142,769)") +
xlab("Sale Price") + ylab("Density") +
scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
theme_minimal()
pAfterNAmes <- ggplot(after2009_adjusted %>% filter(Neighborhood == "NAmes"),
aes(x = SalePrice)) +
geom_density(fill = "orange", alpha = 0.5, color = "black") +
ggtitle("NAmes - After Fraud Fix (Smoothed)") +
xlab("Sale Price") + ylab("Density") +
scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
theme_minimal()
pBeforeGilbert <- ggplot(after2009 %>% filter(Neighborhood == "Gilbert"),
aes(x = SalePrice)) +
geom_density(fill = "lightgreen", alpha = 0.5, color = "black") +
geom_vline(xintercept = 142769.7, color = "red", linetype = "dashed", size = 1) +
ggtitle("Gilbert - Before Fraud Fix (Spike @142,769)") +
xlab("Sale Price") + ylab("Density") +
scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
theme_minimal()
pAfterGilbert <- ggplot(after2009_adjusted %>% filter(Neighborhood == "Gilbert"),
aes(x = SalePrice)) +
geom_density(fill = "lightblue", alpha = 0.5, color = "black") +
ggtitle("Gilbert - After Fraud Fix (Smoothed)") +
xlab("Sale Price") + ylab("Density") +
scale_x_continuous(labels = scales::comma, limits = c(100000, 350000)) +
theme_minimal()
grid.arrange(pBeforeNAmes, pAfterNAmes, pBeforeGilbert, pAfterGilbert, nrow = 2)
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 4 rows containing non-finite outside the scale range
## (`stat_density()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
## Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).

# After applying the fraud fix, the spike at $142,769.70 disappears in both neighborhoods and the distribution becomes smooth, confirming the correction blended well with legitimate market data.
library(broom)
library(ggplot2)
regAfter2009_adjusted = lm(formula(regAfter2009optimal), data = after2009_adjusted)
# augment combines predicted values, residuals, and influence statistics for outlier visualization
ggplot(augment(regAfter2009_adjusted),
aes(x = .fitted, y = .resid)) +
geom_point(alpha = 0.6, color = "blue") +
stat_smooth(method = "loess", color = "red", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

# The Residuals vs Fitted plot confirms the fraud fix was effective. Residuals are now evenly scattered around zero with no visible clustering.
regAfter2009optimalFraud = lm(SalePrice ~ OverallQual + GrLivArea + GarageCars + GarageArea +
TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
Fireplaces + LotArea + BsmtFinSF1 + KitchenQual +
Neighborhood + ExterQual + X1stFlrSF, data = after2009)
summary(regAfter2009optimalFraud)
##
## Call:
## lm(formula = SalePrice ~ OverallQual + GrLivArea + GarageCars +
## GarageArea + TotalBsmtSF + FullBath + YearBuilt + YearRemodAdd +
## Fireplaces + LotArea + BsmtFinSF1 + KitchenQual + Neighborhood +
## ExterQual + X1stFlrSF, data = after2009)
##
## Residuals:
## Min 1Q Median 3Q Max
## -270665 -8685 1348 11161 205033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.067e+06 1.877e+05 -5.687 1.73e-08 ***
## OverallQual.L 1.004e+05 1.948e+04 5.155 3.10e-07 ***
## OverallQual.Q 4.323e+04 1.781e+04 2.427 0.01542 *
## OverallQual.C 1.526e+04 1.489e+04 1.025 0.30563
## OverallQual^4 1.088e+04 1.198e+04 0.909 0.36384
## OverallQual^5 1.379e+03 9.267e+03 0.149 0.88176
## OverallQual^6 2.161e+03 7.357e+03 0.294 0.76906
## OverallQual^7 1.263e+03 5.768e+03 0.219 0.82670
## OverallQual^8 -8.692e+02 4.125e+03 -0.211 0.83317
## OverallQual^9 1.147e+03 2.541e+03 0.451 0.65183
## GrLivArea 4.609e+01 3.335e+00 13.822 < 2e-16 ***
## GarageCars 3.140e+03 2.955e+03 1.062 0.28829
## GarageArea 1.906e+01 1.027e+01 1.856 0.06374 .
## TotalBsmtSF 1.857e+01 4.019e+00 4.622 4.34e-06 ***
## FullBath 1.890e+03 2.526e+03 0.748 0.45463
## YearBuilt 3.183e+02 7.503e+01 4.242 2.43e-05 ***
## YearRemodAdd 2.773e+02 6.573e+01 4.219 2.70e-05 ***
## Fireplaces 3.214e+03 1.764e+03 1.822 0.06882 .
## LotArea 8.393e-01 1.295e-01 6.482 1.47e-10 ***
## BsmtFinSF1 2.552e+01 2.536e+00 10.063 < 2e-16 ***
## KitchenQualFa -2.892e+04 8.816e+03 -3.281 0.00107 **
## KitchenQualGd -2.508e+04 5.746e+03 -4.364 1.42e-05 ***
## KitchenQualTA -2.845e+04 6.210e+03 -4.582 5.23e-06 ***
## NeighborhoodBlueste -2.578e+03 1.577e+04 -0.163 0.87020
## NeighborhoodBrDale 1.149e+04 1.456e+04 0.789 0.43031
## NeighborhoodBrkSide 2.711e+04 1.248e+04 2.172 0.03011 *
## NeighborhoodClearCr 2.756e+04 1.345e+04 2.048 0.04080 *
## NeighborhoodCollgCr 1.582e+04 1.082e+04 1.462 0.14405
## NeighborhoodCrawfor 4.999e+04 1.219e+04 4.102 4.46e-05 ***
## NeighborhoodEdwards 9.523e+03 1.159e+04 0.822 0.41140
## NeighborhoodGilbert 6.744e+03 1.107e+04 0.609 0.54254
## NeighborhoodIDOTRR 6.987e+03 1.360e+04 0.514 0.60763
## NeighborhoodMeadowV -3.260e+03 1.408e+04 -0.232 0.81693
## NeighborhoodMitchel 9.587e+03 1.154e+04 0.831 0.40639
## NeighborhoodNAmes 1.539e+04 1.121e+04 1.373 0.17012
## NeighborhoodNoRidge 4.836e+04 1.233e+04 3.922 9.42e-05 ***
## NeighborhoodNPkVill 7.519e+03 1.315e+04 0.572 0.56775
## NeighborhoodNridgHt 2.118e+04 1.123e+04 1.885 0.05977 .
## NeighborhoodNWAmes 1.102e+04 1.149e+04 0.959 0.33788
## NeighborhoodOldTown 1.001e+04 1.226e+04 0.816 0.41447
## NeighborhoodSawyer 1.102e+04 1.169e+04 0.943 0.34613
## NeighborhoodSawyerW 1.350e+04 1.104e+04 1.224 0.22142
## NeighborhoodSomerst 1.333e+04 1.097e+04 1.215 0.22452
## NeighborhoodStoneBr 5.216e+04 1.319e+04 3.955 8.23e-05 ***
## NeighborhoodSWISU 1.269e+04 1.324e+04 0.958 0.33827
## NeighborhoodTimber 1.701e+04 1.224e+04 1.390 0.16495
## NeighborhoodVeenker 3.652e+04 1.791e+04 2.039 0.04176 *
## ExterQualFa -3.475e+04 1.147e+04 -3.030 0.00251 **
## ExterQualGd -3.252e+04 7.601e+03 -4.279 2.07e-05 ***
## ExterQualTA -3.656e+04 8.230e+03 -4.442 9.97e-06 ***
## X1stFlrSF -1.233e+01 4.879e+00 -2.526 0.01169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28230 on 930 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8691, Adjusted R-squared: 0.8621
## F-statistic: 123.5 on 50 and 930 DF, p-value: < 2.2e-16
library(ggfortify)
autoplot(regAfter2009_adjusted)
## Warning: Removed 273 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

# In the diagnostic plots for regAfter2009_adjusted, a few points lie far from the main trend line in the Residuals vs Fitted and QQ plots. These represent potential outliers with sale prices that differ greatly from model predictions, possibly due to unusual property characteristics or remaining fraudulent reporting.