230524 / BSA12. Tree Ensemble

BSA08_Tree_Ensemble.ipynb

 

Importing packages

import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf
from statsmodels.graphics.mosaicplot import mosaic

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import export_graphviz
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import pydotplus
from IPython.display import Image

 

Loading and preprocessing the data

hmeq = pd.read_csv('hmeq.txt',sep='\t')

# Check data types
hmeq.dtypes

# Descriptive statistics
hmeq.describe()

# First 10 rows
hmeq.head(10)

# Drop the object-type (categorical) columns
hmeq = hmeq.drop(['REASON','JOB'], axis=1)

# Drop or impute missing values (here: replace with each column's median)
#hmeq = hmeq.dropna()
hmeq = hmeq.fillna(hmeq.median())
#hmeq = hmeq.fillna(value=999)
hmeq.describe()
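
An optional quick check that the imputation left no missing values:

# Verify that no missing values remain after median imputation
hmeq.isna().sum()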

 

Separating explanatory and response variables

modelfit_X = hmeq.iloc[:,1:]  # explanatory variables
modelfit_X.head()

modelfit_y = hmeq.iloc[:,0]  # response variable (BAD)
modelfit_y.head()

 

Splitting into training and test sets

X_train, X_test, y_train, y_test = train_test_split(modelfit_X, modelfit_y, test_size=0.4, random_state=0)
print(len(X_train), len(X_test))

Xname = ['LOAN','MORTDUE','VALUE','YOJ','DEROG','DELINQ','CLAGE','NINQ','CLNO','DEBTINC']
yname = ['good','bad']
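
BAD=1 (bad) is typically the minority class in the HMEQ data, so a stratified split that preserves the good/bad ratio in both sets is sometimes preferred. A minimal sketch; the *_s variable names are only for this illustration, and the rest of the notebook keeps the unstratified split above:

# Optional: a stratified split keeps the class ratio equal in train and test
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    modelfit_X, modelfit_y, test_size=0.4, random_state=0, stratify=modelfit_y)
print(y_train_s.mean(), y_test_s.mean())  # proportion of BAD=1 in each split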

 

Growing classification trees

# default tree (maximum)
cart0 = tree.DecisionTreeClassifier(criterion='gini',random_state=0)
cart0.fit(X_train, y_train)
dot_data = export_graphviz(cart0, out_file=None, feature_names=Xname,
    class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())


# simple tree
cart1 = tree.DecisionTreeClassifier(criterion='gini',max_depth=2,random_state=0)
cart1.fit(X_train, y_train)
dot_data = export_graphviz(cart1, out_file=None, feature_names=Xname,
    class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())


# larger tree (cart2): split only if the impurity decrease is at least 0.01
cart2 = tree.DecisionTreeClassifier(criterion='gini',min_impurity_decrease=0.01,min_samples_split=20,random_state=0)
cart2.fit(X_train, y_train)
dot_data = export_graphviz(cart2, out_file=None, feature_names=Xname,
    class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())


# larger tree (cart3): split only if the impurity decrease is at least 0.005
cart3 = tree.DecisionTreeClassifier(criterion='gini',min_impurity_decrease=0.005,min_samples_split=20,random_state=0)
cart3.fit(X_train, y_train)
dot_data = export_graphviz(cart3, out_file=None, feature_names=Xname,
    class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())


# larger tree (cart4): split only if the impurity decrease is at least 0.001
cart4 = tree.DecisionTreeClassifier(criterion='gini',min_impurity_decrease=0.001,min_samples_split=20,random_state=0)
cart4.fit(X_train, y_train)
dot_data = export_graphviz(cart4, out_file=None, feature_names=Xname,
    class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.set_size('"10,10"')
Image(graph.create_png())
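
If graphviz/pydotplus is not available, sklearn's built-in plot_tree draws a similar picture directly with matplotlib; a minimal sketch using the small tree cart1:

# Alternative visualization without graphviz
plt.figure(figsize=(12, 6))
tree.plot_tree(cart1, feature_names=Xname, class_names=yname, filled=True, rounded=True)
plt.show()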

 

Performance comparison (ROC Curve)

# Predicted probability of class 1 (bad) from each tree
y_prob1 = cart1.predict_proba(X_test)[:,1]
y_prob2 = cart2.predict_proba(X_test)[:,1]
y_prob3 = cart3.predict_proba(X_test)[:,1]
y_prob4 = cart4.predict_proba(X_test)[:,1]
y_prob0 = cart0.predict_proba(X_test)[:,1]

# ROC Curve
roc_auc = roc_auc_score(y_test, y_prob1 )
fpr, tpr, thresholds = roc_curve(y_test, y_prob1)
roc_auc2 = roc_auc_score(y_test, y_prob2 )
fpr2, tpr2, thresholds2 = roc_curve(y_test, y_prob2)
roc_auc3 = roc_auc_score(y_test, y_prob3 )
fpr3, tpr3, thresholds3 = roc_curve(y_test, y_prob3)
roc_auc4 = roc_auc_score(y_test, y_prob4 )
fpr4, tpr4, thresholds4 = roc_curve(y_test, y_prob4)
roc_auc0 = roc_auc_score(y_test, y_prob0 )
fpr0, tpr0, thresholds0 = roc_curve(y_test, y_prob0)
plt.figure()
plt.plot(fpr, tpr, label='Tree 1 (area = %0.2f)' % roc_auc)
plt.plot(fpr2, tpr2, label='Tree 2 (area = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, label='Tree 3 (area = %0.2f)' % roc_auc3)
plt.plot(fpr4, tpr4, label='Tree 4 (area = %0.2f)' % roc_auc4)
plt.plot(fpr0, tpr0, label='Tree 0 (area = %0.2f)' % roc_auc0)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
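
AUC compares the trees over all thresholds; the test-set accuracy at the default 0.5 cutoff can also be printed for a quick side-by-side check. A small sketch using the fitted trees above:

# Test accuracy of each tree at the default 0.5 threshold
for name, model in [('cart0', cart0), ('cart1', cart1), ('cart2', cart2),
                    ('cart3', cart3), ('cart4', cart4)]:
    print(name, accuracy_score(y_test, model.predict(X_test)))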

 

1. Bagging

# Bagging
bagging = RandomForestClassifier(n_estimators=100, max_features=None, random_state=1234)
bagging.fit(X_train, y_train)

# Accuracy
bagging_pred = bagging.predict(X_test)
accuracy = accuracy_score(y_test, bagging_pred)
print(f'Mean accuracy score: {accuracy:.3}')

# Confusion Matrix
cm = pd.DataFrame(confusion_matrix(y_test, bagging_pred), columns=yname, index=yname)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")

# Prediction probability
y_prob_bag = bagging.predict_proba(X_test)[:,1]
roc_auc_bag = roc_auc_score(y_test, y_prob_bag )
fpr_bag, tpr_bag, thresholds_bag = roc_curve(y_test, y_prob_bag)
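
With max_features=None every split considers all 10 features, so the "random forest" above is effectively plain bagging of trees. The same idea can be written more explicitly with sklearn's BaggingClassifier, whose default base learner is a decision tree; a rough sketch (results will be similar to, but not identical with, the call above):

from sklearn.ensemble import BaggingClassifier

# Bagging written explicitly: bootstrap samples + full-feature decision trees
bag2 = BaggingClassifier(n_estimators=100, random_state=1234)
bag2.fit(X_train, y_train)
print(accuracy_score(y_test, bag2.predict(X_test)))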

 

2. Random Forest

# Random Forest 1
rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', oob_score=True, random_state=1234)
rf.fit(X_train, y_train)

# Accuracy
rf_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, rf_pred)
print(f'Mean accuracy score: {accuracy:.3}')

# Confusion Matrix
cm = pd.DataFrame(confusion_matrix(y_test, rf_pred), columns=yname, index=yname)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")

# Prediction probability
y_prob_rf = rf.predict_proba(X_test)[:,1]
roc_auc_rf = roc_auc_score(y_test, y_prob_rf )
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_prob_rf)
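
Because oob_score=True was set, the forest also provides an out-of-bag accuracy estimate that uses no test data; a quick look:

# Out-of-bag accuracy estimate (from training rows left out of each bootstrap sample)
print(f'OOB score: {rf.oob_score_:.3f}')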

# Random Forest 2
# consider only 5 of the 10 explanatory variables at each split
rf1 = RandomForestClassifier(n_estimators=100, max_features=5, oob_score=True, random_state=1234)
rf1.fit(X_train, y_train)
rf1_pred = rf1.predict(X_test)
accuracy = accuracy_score(y_test, rf1_pred)
print(f'Mean accuracy score: {accuracy:.3}')

# Confusion Matrix
cm = pd.DataFrame(confusion_matrix(y_test, rf1_pred), columns=yname, index=yname)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")

# Prediction probability
y_prob_rf1 = rf1.predict_proba(X_test)[:,1]
roc_auc_rf1 = roc_auc_score(y_test, y_prob_rf1 )
fpr_rf1, tpr_rf1, thresholds_rf1 = roc_curve(y_test, y_prob_rf1)

 

Performance comparison (ROC Curve)

# ROC Curve
plt.figure()
plt.plot(fpr3, tpr3, label='Tree 3 (area = %0.2f)' % roc_auc3)
plt.plot(fpr4, tpr4, label='Tree 4 (area = %0.2f)' % roc_auc4)
plt.plot(fpr_bag, tpr_bag, label='Bagging (area = %0.2f)' % roc_auc_bag)
plt.plot(fpr_rf, tpr_rf, label='Random Forest 1 (area = %0.2f)' % roc_auc_rf)
plt.plot(fpr_rf1, tpr_rf1, label='Random Forest 2 (area = %0.2f)' % roc_auc_rf1)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

 

Feature importances

# Get numerical feature importances
importances = rf1.feature_importances_
for name, importance in zip(Xname, importances):
    print(name, "=", importance)
indices = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)),X_train.columns[indices])
plt.xlabel('Relative Importance')
plt.show()
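
The same importances are often easier to read as a sorted table; a small sketch:

# Feature importances as a sorted table
pd.Series(importances, index=X_train.columns).sort_values(ascending=False)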