library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
loanData = readRDS("LoanData.rds")
glimpse(loanData)
## Rows: 29,092
## Columns: 8
## $ isLoanDefault   <int> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, …
## $ loanAmount      <int> 5000, 2400, 10000, 5000, 3000, 12000, 9000, 3000, 1000…
## $ interestRate    <dbl> 10.65, NA, 13.49, NA, NA, 12.69, 13.49, 9.91, 10.65, 1…
## $ creditGrade     <fct> B, C, C, A, E, B, C, B, B, D, C, A, B, A, B, B, B, B, …
## $ employmentYears <int> 10, 25, 13, 3, 9, 11, 0, 3, 3, 0, 4, 13, 1, 6, 17, 13,…
## $ homeLiving      <fct> RENT, RENT, RENT, RENT, RENT, OWN, RENT, RENT, RENT, R…
## $ incomeAnnual    <dbl> 24000.00, 12252.00, 49200.00, 36000.00, 48000.00, 7500…
## $ ageYears        <int> 33, 31, 24, 39, 24, 28, 22, 22, 28, 22, 23, 27, 30, 24…
head(loanData)
##   isLoanDefault loanAmount interestRate creditGrade employmentYears homeLiving
## 1             0       5000        10.65           B              10       RENT
## 2             0       2400           NA           C              25       RENT
## 3             0      10000        13.49           C              13       RENT
## 4             0       5000           NA           A               3       RENT
## 5             0       3000           NA           E               9       RENT
## 6             0      12000        12.69           B              11        OWN
##   incomeAnnual ageYears
## 1        24000       33
## 2        12252       31
## 3        49200       24
## 4        36000       39
## 5        48000       24
## 6        75000       28
loanDataNoOutliers = loanData[!is.na(loanData$incomeAnnual) & !(2500000 < loanData$incomeAnnual), ]

isMissingInterestRate = is.na(loanDataNoOutliers$interestRate)
isMissingEmploymentYears = is.na(loanDataNoOutliers$employmentYears)

loanDataNoOutliersNA = loanDataNoOutliers

loanDataNoOutliersNA$interestRate[isMissingInterestRate] = median(loanDataNoOutliersNA$interestRate, na.rm = TRUE)
naInterestRate = as.integer(isMissingInterestRate)
loanDataNoOutliersNA = cbind(loanDataNoOutliersNA, naInterestRate)

loanDataNoOutliersNA$employmentYears[isMissingEmploymentYears] = median(loanDataNoOutliersNA$employmentYears, na.rm = TRUE)
naEmploymentYears = as.integer(isMissingEmploymentYears)
loanDataNoOutliersNA = cbind(loanDataNoOutliersNA, naEmploymentYears)

set.seed(2020)

loanTestIndices = sample(1:nrow(loanDataNoOutliersNA), nrow(loanDataNoOutliersNA)*1/3, replace = FALSE)
loanTest = loanDataNoOutliersNA[loanTestIndices, ]
loanTrain = loanDataNoOutliersNA[-loanTestIndices, ]
getGini = function(good, bad) {
   total = good+bad
   return(2*(bad/total) * (good/total))
}

defaults = sum(loanData$isLoanDefault)
non_defaults = sum(0 == loanData$isLoanDefault)

gini_root = getGini(non_defaults, defaults)
paste("The Gini of the root is:", gini_root)
## [1] "The Gini of the root is: 0.197239678524086"
# Left side counts
defaults_left = 1000
nondefaults_left = 8015
gini_left = getGini(nondefaults_left, defaults_left)

# Right side counts
defaults_right = 2227
nondefaults_right = 17850
gini_right = getGini(nondefaults_right, defaults_right)

# Total counts
total_left = defaults_left + nondefaults_left
total_right = defaults_right + nondefaults_right
total_all = total_left + total_right

# Gini gain
gini_gain = gini_root -
  (total_left/total_all * gini_left) -
  (total_right/total_all * gini_right)

gini_left
## [1] 0.1972432
gini_right
## [1] 0.1972381
gini_gain
## [1] 4.622219e-12
# The gini_gain is essentially zero, indicating no value in purchasing feature xyz. The near-zero gain means the feature does not improve the purity of the split, suggesting the data is already relatively balanced or the feature provides no useful discriminatory power.
library(rpart)
tree_defaults = rpart(isLoanDefault ~ ., method="class", data = loanTrain)
# plot(tree_defaults) - commented out; produces an error because the tree has no splits
# Error: fit is not a tree, just a root
# The decision tree produced no splits because the dataset is heavily imbalanced. With very few defaults relative to non-defaults, the gini gain from any split is near zero, so rpart creates only a root node with no branches.
# Select all rows where the loan defaulted
loanTrain_default = loanData[1 == loanData$isLoanDefault, ]

# Select all rows where the loan did not default
loanTrain_nodefault = loanData[0 == loanData$isLoanDefault, ]

# Randomly sample twice as many non-default cases as there are default cases to create a 2:1 ratio
loanTestIndices = sample(1:nrow(loanTrain_nodefault), 2*nrow(loanTrain_default), replace = FALSE)

# Keep only the sampled non-default rows
loanTrain_nodefault = loanTrain_nodefault[loanTestIndices,]

# Combine default and sampled non-default cases into one balanced training dataset
loanTrain_balanced = rbind(loanTrain_default, loanTrain_nodefault)
library(rpart)
tree_balanced = rpart(isLoanDefault ~ .,
                      data = loanTrain_balanced,
                      control = rpart.control(cp = 0.001))

plot(tree_balanced, uniform = TRUE)
text(tree_balanced)

# uniform = TRUE creates equal vertical spacing between branches, producing a cleaner and more readable decision tree layout.
tree_balanced = rpart(isLoanDefault ~ .,
                      data = loanTrain_balanced,
                      control = rpart.control(cp = 0.001),
                      parms = list(prior = c(0.7, 0.3)))

plot(tree_balanced, uniform = TRUE)
text(tree_balanced)

# M = 10 means a missed default (false negative) is penalized 10x more than a false positive
M = 10

tree_defaults_balanced_parms_lossmatrix = rpart(
  isLoanDefault ~ .,
  data = loanTrain_balanced,
  control = rpart.control(cp = 0.001),
  parms = list(loss = matrix(c(0, M, 1, 0), ncol = 2), prior = c(0.67, 0.33)))

plot(tree_balanced, uniform = TRUE)
text(tree_defaults_balanced_parms_lossmatrix, cex = 0.25)

# Prune using cp = 0.002 (midpoint of the 0.001 to 0.004 range)
tree_lovely = prune(tree_defaults_balanced_parms_lossmatrix, cp = 0.002)

plot(tree_lovely, uniform = TRUE)
text(tree_lovely, cex = 0.25)