import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# First load attempt, using pandas' default comma delimiter.
# NOTE(review): the reload just below passes sep=";", so the file is presumably
# semicolon-delimited and this first read is superseded -- confirm it is only
# kept to show the unparsed result.
bank_data = pd.read_csv("data/bank_data.csv")
bank_data.head()

# Reload with a semicolon separator; this rebinds `bank_data`.
bank_data = pd.read_csv("data/bank_data.csv", sep=";")
bank_data.head()

# Drop five columns in place before modelling.
# The guard only catches NameError (i.e. `bank_data` was never defined); any
# other failure, such as a missing column, still propagates.
try:
    bank_data.drop(['day', 'month', 'duration', 'pdays', 'poutcome'], axis=1, inplace=True)
except NameError:
    print('The object `bank_data` does not exist!')

# Encode the target column: 'yes' becomes 1, every other value becomes 0.
try:
    bank_data["y"] = bank_data['y'].apply(lambda y: 1 if y == 'yes' else 0)
    bank_data.head()
except NameError:
    print('The object `bank_data` does not exist!')

# Expand the remaining categorical columns into indicator (dummy) columns.
bank_data = pd.get_dummies(bank_data)
bank_data.head()

# Separate the feature matrix X from the target Y.
try:
    X = bank_data.drop('y', axis=1)
    Y = bank_data['y']
except NameError:
    print('The object `bank_data` does not exist!')

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size=0.25)

def value_function(y_true, y_pred, tn_value=10, fp_value=-10, fn_value=-100, tp_value=100):
    """Return the average per-sample value of a set of binary predictions.

    Every sample falls into one confusion-matrix cell (true negative, false
    positive, false negative or true positive) based on labels equal to 0 or 1.
    The fraction of samples in each cell is multiplied by that cell's value and
    the four products are summed. Samples whose true or predicted label is
    neither 0 nor 1 contribute nothing to any cell.

    Args:
        y_true: Array-like of ground-truth labels.
        y_pred: Array-like of predicted labels, aligned by position with
            ``y_true``.
        tn_value: Value of a true negative (default 10).
        fp_value: Value of a false positive (default -10).
        fn_value: Value of a false negative (default -100).
        tp_value: Value of a true positive (default 100).

    Returns:
        The weighted sum of the four cell frequencies as a NumPy float.
    """
    # Work on plain arrays and compare labels directly instead of using
    # y_pred + y_true / y_pred - y_true: that arithmetic wraps around for
    # unsigned integer labels (0 - 1 never equals -1, so false negatives were
    # missed), fails for boolean arrays, and misbehaves for plain lists.
    truth = np.asarray(y_true)
    guess = np.asarray(y_pred)

    truth_pos, truth_neg = truth == 1, truth == 0
    guess_pos, guess_neg = guess == 1, guess == 0

    tn_contrib = tn_value * np.mean(guess_neg & truth_neg)
    fp_contrib = fp_value * np.mean(guess_pos & truth_neg)
    fn_contrib = fn_value * np.mean(guess_neg & truth_pos)
    tp_contrib = tp_value * np.mean(guess_pos & truth_pos)
    return tn_contrib + fp_contrib + fn_contrib + tp_contrib

# Boost depth-5 decision trees for 2000 rounds at a learning rate of 0.80.
# Both the base tree and the ensemble are seeded so repeated runs match.
ada_boost = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(random_state=42, max_depth=5),
    n_estimators=2000,
    learning_rate=0.80,
    random_state=42)
ada_boost.fit(X_train, Y_train)

# Captured notebook output of the fit call above (estimator reprs). It is kept
# as a comment because, left as bare expressions, these lines constructed and
# immediately discarded additional unfitted estimators.
#
# AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=5,
#                                                     random_state=42),
#                    learning_rate=0.8, n_estimators=2000, random_state=42)
#
# DecisionTreeClassifier(max_depth=5, random_state=42)

# Baseline model: a single decision tree with default settings apart from the seed.
# NOTE(review): if the training data is undefined, this guard only prints a
# message; `tree` is then never bound and the scoring lines below will fail.
try:
    tree = DecisionTreeClassifier(random_state=42).fit(X_train, Y_train)
except NameError:
    print('The objects `X_train, Y_train` do not exist!')

# Score both fitted models on the held-out test split with the value function.
ada_value = value_function(Y_test.values, ada_boost.predict(X_test))
tree_value = value_function(Y_test.values, tree.predict(X_test))

print('AdaBoost value function:', ada_value)
print('Decision Tree value function:', tree_value)

# The decision tree scored higher, so it looks like it is doing better here than AdaBoost.

# Output:
# AdaBoost value function: -0.7431655312748837
# Decision Tree value function: 0.9926568167743075

def marketing_profits(model, X, Y, fp_value=-10, tp_value=100):
    """Return the total profit of acting on ``model``'s positive predictions.

    Only predicted positives count: each predicted positive whose true label is
    greater than 0 earns ``tp_value``, and each predicted positive whose true
    label is less than 1 earns ``fp_value``. Predicted negatives contribute
    nothing.

    Args:
        model: Object exposing ``predict(X)`` that returns one label per row.
        X: Feature data passed straight through to ``model.predict``.
        Y: Array-like of true labels, aligned by position with the predictions.
        fp_value: Profit (normally negative) per false positive (default -10).
        tp_value: Profit per true positive (default 100).

    Returns:
        The summed profit over all predicted positives.
    """
    # Predict once and reuse the result: the original called predict() twice,
    # doubling inference cost for the same answer.
    predicted_pos = np.asarray(model.predict(X)) > 0
    actual = np.asarray(Y)

    tp_contrib = np.sum(predicted_pos & (actual > 0)) * tp_value
    fp_contrib = np.sum(predicted_pos & (actual < 1)) * fp_value
    return tp_contrib + fp_contrib

# Test-set marketing profit for each model, then AdaBoost's profit expressed as
# a percentage change relative to the single decision tree.
# NOTE(review): the division assumes `tree_prof` is non-zero -- confirm.
ada_prof = marketing_profits(ada_boost, X_test, Y_test)
tree_prof = marketing_profits(tree, X_test, Y_test)
percent_diff = ((ada_prof - tree_prof) / tree_prof) * 100
print('Percent change in profits using AdaBoost vs Decision Tree:', percent_diff, '%')

# Output:
# Percent change in profits using AdaBoost vs Decision Tree: -41.72692471288813 %

def _value_function(y, y_pred, **kwargs):
    """Forward true labels, predictions and any keyword overrides to ``value_function``."""
    score = value_function(y, y_pred, **kwargs)
    return score


# Scorer object built from the adapter above, for use as a `scoring` argument.
value_function_wrapper = make_scorer(_value_function)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# For my model I went with a random forest, since it is basically many decision
# trees working together, which should hopefully do better than one tree alone.

# The data is imbalanced (only about 10% said yes), so I'm using
# class_weight='balanced' so that the model does not just ignore the "yes" group.

# Base random forest: balanced class weights and a fixed seed. The number of
# trees and the depth are left to the grid search.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

# Candidate values for the number of trees and the maximum depth
# (3 x 4 = 12 combinations; a max_depth of None is also tried).
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None]
}

# need to set up the value function as a scorer so GridSearchCV can use it
def _value_function(y, y_pred, **kwargs):
    """Score predictions with ``value_function``, passing keyword overrides through."""
    labels, predictions = y, y_pred
    return value_function(labels, predictions, **kwargs)

# Wrap the adapter as a scorer so it can be passed as a `scoring` argument.
value_function_wrapper = make_scorer(_value_function)

# Run a 5-fold cross-validated grid search over `param_grid`, scored with the
# value-function scorer so that tuning targets the business metric rather than
# a generic one.
grid_search = GridSearchCV(rf, param_grid, scoring=value_function_wrapper, cv=5)
grid_search.fit(X_train, Y_train)

print('Best parameters:', grid_search.best_params_)
print('Best CV score:', grid_search.best_score_)

# Keep the best estimator found by the search for the test-set comparisons.
best_rf = grid_search.best_estimator_

# Compare all three models on the test set using the value function.
rf_val = value_function(Y_test.values, best_rf.predict(X_test))
ada_val = value_function(Y_test.values, ada_boost.predict(X_test))
tree_val = value_function(Y_test.values, tree.predict(X_test))

print('\nValue Function Results:')
print('AdaBoost:', ada_val)
print('Decision Tree:', tree_val)
print('Random Forest (tuned):', rf_val)

# Compute the test-set marketing profit for each model.
rf_prof = marketing_profits(best_rf, X_test, Y_test)
ada_prof = marketing_profits(ada_boost, X_test, Y_test)
tree_prof = marketing_profits(tree, X_test, Y_test)

print('\nMarketing Profits:')
print('AdaBoost:', ada_prof)
print('Decision Tree:', tree_prof)
print('Random Forest (tuned):', rf_prof)

# Percent change in profit of the random forest relative to the decision tree.
# NOTE(review): assumes `tree_prof` is non-zero -- confirm.
rf_vs_tree = ((rf_prof - tree_prof) / tree_prof) * 100
print('\nRF vs Decision Tree profit change:', round(rf_vs_tree, 2), '%')

# Percent change in profit of the random forest relative to AdaBoost.
# NOTE(review): assumes `ada_prof` is non-zero -- confirm.
rf_vs_ada = ((rf_prof - ada_prof) / ada_prof) * 100
print('RF vs AdaBoost profit change:', round(rf_vs_ada, 2), '%')

# Output:
# Best parameters: {'max_depth': 5, 'n_estimators': 200}
# Best CV score: 7.150506182578337
#
# Value Function Results:
# AdaBoost: -0.7431655312748837
# Decision Tree: 0.9926568167743075
# Random Forest (tuned): 6.952136600902416
#
# Marketing Profits:
# AdaBoost: 13700
# Decision Tree: 23510
# Random Forest (tuned): 57190
#
# RF vs Decision Tree profit change: 143.26 %
# RF vs AdaBoost profit change: 317.45 %

# Output of the first `bank_data.head()` (file read without `sep=";"`):
# 	age;"job";"marital";"education";"default";"balance";"housing";"loan";"contact";"day";"month";"duration";"campaign";"pdays";"previous";"poutcome";"y"
# 0	58;"management";"married";"tertiary";"no";2143...
# 1	44;"technician";"single";"secondary";"no";29;"...
# 2	33;"entrepreneur";"married";"secondary";"no";2...
# 3	47;"blue-collar";"married";"unknown";"no";1506...
# 4	33;"unknown";"single";"unknown";"no";1;"no";"n...

# Output of `bank_data.head()` after reloading with `sep=";"`:
# 	age	job	marital	education	default	balance	housing	loan	contact	day	month	duration	campaign	pdays	poutcome	y
# 0	58	management	married	tertiary	no	2143	yes	no	unknown	5	may	261	1	-1	unknown	no
# 1	44	technician	single	secondary	no	29	yes	no	unknown	5	may	151	1	-1	unknown	no
# 2	33	entrepreneur	married	secondary	no	2	yes	yes	unknown	5	may	76	1	-1	unknown	no
# 3	47	blue-collar	married	unknown	no	1506	yes	no	unknown	5	may	92	1	-1	unknown	no
# 4	33	unknown	single	unknown	no	1	no	no	unknown	5	may	198	1	-1	unknown	no

# Output of `bank_data.head()` after `pd.get_dummies`:
# 	age	balance	campaign	job_admin.	job_blue-collar	job_entrepreneur	job_housemaid	job_management	...	education_unknown	default_no	default_yes	housing_no	housing_yes	loan_no	loan_yes	contact_cellular	contact_telephone	contact_unknown
# 0	58	2143	1	False	False	False	False	True	...	False	True	False	False	True	True	False	False	False	True
# 1	44	29	1	False	False	False	False	False	...	False	True	False	False	True	True	False	False	False	True
# 2	33	2	1	False	False	True	False	False	...	False	True	False	False	True	False	True	False	False	True
# 3	47	1506	1	False	True	False	False	False	...	True	True	False	False	True	True	False	False	False	True
# 4	33	1	1	False	False	False	False	False	...	True	True	False	True	False	True	False	False	False	True

# AdaBoost Ensemble for Bank Marketing Campaign ROI