BSA08_Tree_Ensemble.ipynb
Package imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from statsmodels.graphics.mosaicplot import mosaic
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import export_graphviz
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import pydotplus
from IPython.display import Image
Data loading and preprocessing
hmeq = pd.read_csv('hmeq.txt',sep='\t')
# Check data types
hmeq.dtypes
# Check descriptive statistics
hmeq.describe()
# Check the first 10 rows
hmeq.head(10)
# Drop object-type variables (columns)
hmeq = hmeq.drop(['REASON','JOB'], axis=1)
# Drop or impute missing values
#hmeq = hmeq.dropna()
hmeq = hmeq.fillna(hmeq.median())
#hmeq = hmeq.fillna(value=999)
hmeq.describe()
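A quick sanity check (not in the original notebook) that the median imputation left no missing values:
# Every column should now report zero missing values after the fillna above.
print(hmeq.isna().sum())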
Separating explanatory and response variables
modelfit_X = hmeq.iloc[:,1:] # explanatory variables
modelfit_X.head()
modelfit_y = hmeq.iloc[:,0] # response variable (BAD)
modelfit_y.head()
Splitting training and test data
X_train, X_test, y_train, y_test = train_test_split(modelfit_X, modelfit_y, test_size=0.4, random_state=0)
print(len(X_train), len(X_test))
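The BAD classes in HMEQ are typically imbalanced (defaults are the minority), so a stratified split keeps the class ratio similar in both sets. The sketch below is an alternative to, not a replacement for, the split used in the rest of the notebook; the stratify argument is the only change.
# Alternative: stratified split so train/test keep the same BAD=1 ratio.
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
    modelfit_X, modelfit_y, test_size=0.4, random_state=0, stratify=modelfit_y)
print(y_tr_s.mean(), y_te_s.mean())  # proportion of BAD=1 in each split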
Xname = ['LOAN','MORTDUE','VALUE','YOJ','DEROG','DELINQ','CLAGE','NINQ','CLNO','DEBTINC']
yname = ['good','bad']
Building classification trees
# default tree (grown to full size)
cart0 = tree.DecisionTreeClassifier(criterion='gini',random_state=0)
cart0.fit(X_train, y_train)
dot_data = export_graphviz(cart0, out_file=None, feature_names=Xname,
class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
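If Graphviz/pydotplus is not available, scikit-learn's built-in tree.plot_tree (available since 0.21) draws the same tree with matplotlib; a sketch showing only the top two levels of cart0 to keep the figure readable (the figure size is an arbitrary choice):
# Graphviz-free alternative: draw only the top levels of the full tree.
plt.figure(figsize=(12, 6))
tree.plot_tree(cart0, max_depth=2, feature_names=Xname, class_names=yname,
               filled=True, rounded=True, fontsize=8)
plt.show()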
# simple tree (max_depth=2)
cart1 = tree.DecisionTreeClassifier(criterion='gini',max_depth=2,random_state=0)
cart1.fit(X_train, y_train)
dot_data = export_graphviz(cart1, out_file=None, feature_names=Xname,
class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# larger tree 2 (min_impurity_decrease=0.01)
cart2 = tree.DecisionTreeClassifier(criterion='gini',min_impurity_decrease=0.01,min_samples_split=20,random_state=0)
cart2.fit(X_train, y_train)
dot_data = export_graphviz(cart2, out_file=None, feature_names=Xname,
class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# larger tree 3 (min_impurity_decrease=0.005)
cart3 = tree.DecisionTreeClassifier(criterion='gini',min_impurity_decrease=0.005,min_samples_split=20,random_state=0)
cart3.fit(X_train, y_train)
dot_data = export_graphviz(cart3, out_file=None, feature_names=Xname,
class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())
# larger tree 4 (min_impurity_decrease=0.001)
cart4 = tree.DecisionTreeClassifier(criterion='gini',min_impurity_decrease=0.001,min_samples_split=20,random_state=0)
cart4.fit(X_train, y_train)
dot_data = export_graphviz(cart4, out_file=None, feature_names=Xname,
class_names=yname,filled=True, rounded=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.set_size('"10,10"')
Image(graph.create_png())
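Instead of hand-tuning max_depth, min_impurity_decrease, and min_samples_split, the tree size can also be chosen by cost-complexity pruning. The sketch below (assuming scikit-learn >= 0.22 for ccp_alpha) scans the pruning path of the full tree and keeps the alpha with the best test AUC; selecting on the test set simply mirrors how the trees are compared in the next cell, while a separate validation split or cross-validation would be the stricter choice.
# Cost-complexity pruning: compute the pruning path of the full tree, then
# refit one pruned tree per candidate alpha and keep the best test AUC.
path = cart0.cost_complexity_pruning_path(X_train, y_train)
best_auc, best_alpha = 0.0, 0.0
for alpha in np.unique(np.clip(path.ccp_alphas, 0, None)):  # clip guards tiny negatives
    pruned = tree.DecisionTreeClassifier(criterion='gini', ccp_alpha=alpha,
                                         random_state=0).fit(X_train, y_train)
    auc = roc_auc_score(y_test, pruned.predict_proba(X_test)[:, 1])
    if auc > best_auc:
        best_auc, best_alpha = auc, alpha
print('best ccp_alpha:', best_alpha, 'test AUC:', round(best_auc, 3))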
Performance comparison (ROC curve)
# Predicted class-1 probabilities of the five trees for ROC comparison
y_prob1 = cart1.predict_proba(X_test)[:,1]
y_prob2 = cart2.predict_proba(X_test)[:,1]
y_prob3 = cart3.predict_proba(X_test)[:,1]
y_prob4 = cart4.predict_proba(X_test)[:,1]
y_prob0 = cart0.predict_proba(X_test)[:,1]
# ROC Curve
roc_auc = roc_auc_score(y_test, y_prob1 )
fpr, tpr, thresholds = roc_curve(y_test, y_prob1)
roc_auc2 = roc_auc_score(y_test, y_prob2 )
fpr2, tpr2, thresholds2 = roc_curve(y_test, y_prob2)
roc_auc3 = roc_auc_score(y_test, y_prob3 )
fpr3, tpr3, thresholds3 = roc_curve(y_test, y_prob3)
roc_auc4 = roc_auc_score(y_test, y_prob4 )
fpr4, tpr4, thresholds4 = roc_curve(y_test, y_prob4)
roc_auc0 = roc_auc_score(y_test, y_prob0 )
fpr0, tpr0, thresholds0 = roc_curve(y_test, y_prob0)
plt.figure()
plt.plot(fpr, tpr, label='Tree 1 (area = %0.2f)' % roc_auc)
plt.plot(fpr2, tpr2, label='Tree 2 (area = %0.2f)' % roc_auc2)
plt.plot(fpr3, tpr3, label='Tree 3 (area = %0.2f)' % roc_auc3)
plt.plot(fpr4, tpr4, label='Tree 4 (area = %0.2f)' % roc_auc4)
plt.plot(fpr0, tpr0, label='Tree 0 (area = %0.2f)' % roc_auc0)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
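The AUC/ROC bookkeeping above repeats the same three lines per tree; purely as a refactor, the same plot can be produced with a loop over the fitted models:
# Same ROC comparison, written as a loop over the fitted trees.
plt.figure()
for label, model in [('Tree 1', cart1), ('Tree 2', cart2), ('Tree 3', cart3),
                     ('Tree 4', cart4), ('Tree 0', cart0)]:
    prob = model.predict_proba(X_test)[:, 1]
    fpr_m, tpr_m, _ = roc_curve(y_test, prob)
    plt.plot(fpr_m, tpr_m, label='%s (area = %0.2f)' % (label, roc_auc_score(y_test, prob)))
plt.plot([0, 1], [0, 1], 'r--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()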
1. Bagging
# Bagging
bagging = RandomForestClassifier(n_estimators=100, max_features=None, random_state=1234)
bagging.fit(X_train, y_train)
# Accuracy
bagging_pred = bagging.predict(X_test)
accuracy = accuracy_score(y_test, bagging_pred)
print(f'Mean accuracy score: {accuracy:.3}')
# Confusion Matrix
cm = pd.DataFrame(confusion_matrix(y_test, bagging_pred), columns=yname, index=yname)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
# Prediction probability
y_prob_bag = bagging.predict_proba(X_test)[:,1]
roc_auc_bag = roc_auc_score(y_test, y_prob_bag )
fpr_bag, tpr_bag, thresholds_bag = roc_curve(y_test, y_prob_bag)
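Here bagging is emulated with RandomForestClassifier(max_features=None), i.e. bootstrap samples but no random feature subsetting. The same idea can be written explicitly with sklearn.ensemble.BaggingClassifier; a sketch (the estimator= keyword assumes scikit-learn >= 1.2, older versions call it base_estimator=):
from sklearn.ensemble import BaggingClassifier
# Explicit bagging: 100 bootstrap samples, each fit with an unrestricted tree.
bag2 = BaggingClassifier(estimator=tree.DecisionTreeClassifier(random_state=0),
                         n_estimators=100, random_state=1234)
bag2.fit(X_train, y_train)
print(f'BaggingClassifier accuracy: {accuracy_score(y_test, bag2.predict(X_test)):.3}')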
2. Random Forest
# Random Forest 1
rf = RandomForestClassifier(n_estimators=1000, max_features='sqrt', oob_score=True, random_state=1234)
rf.fit(X_train, y_train)
# Accuracy
rf_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, rf_pred)
print(f'Mean accuracy score: {accuracy:.3}')
# Confusion Matrix
cm = pd.DataFrame(confusion_matrix(y_test, rf_pred), columns=yname, index=yname)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
# Prediction probability
y_prob_rf = rf.predict_proba(X_test)[:,1]
roc_auc_rf = roc_auc_score(y_test, y_prob_rf )
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, y_prob_rf)
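Because oob_score=True was passed, the out-of-bag accuracy is already available and can be compared with the test accuracy without any extra fitting:
# Out-of-bag accuracy: each tree is scored on the samples left out of its
# bootstrap draw, giving a test-like estimate from the training data alone.
print(f'OOB accuracy:  {rf.oob_score_:.3}')
print(f'Test accuracy: {accuracy_score(y_test, rf_pred):.3}')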
# Random Forest 2
# Using only 5 of the 10 explanatory variables (max_features=5) at each split
rf1 = RandomForestClassifier(n_estimators=100, max_features=5, oob_score=True, random_state=1234)
rf1.fit(X_train, y_train)
rf1_pred = rf1.predict(X_test)
accuracy = accuracy_score(y_test, rf1_pred)
print(f'Mean accuracy score: {accuracy:.3}')
# Confusion Matrix
cm = pd.DataFrame(confusion_matrix(y_test, rf1_pred), columns=yname, index=yname)
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
# Prediction probability
y_prob_rf1 = rf1.predict_proba(X_test)[:,1]
roc_auc_rf1 = roc_auc_score(y_test, y_prob_rf1 )
fpr_rf1, tpr_rf1, thresholds_rf1 = roc_curve(y_test, y_prob_rf1)
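Rather than comparing max_features='sqrt' and max_features=5 by hand, the value can be tuned with cross-validation on the training data. A minimal sketch with GridSearchCV; the candidate grid, 100 trees, and 5 folds are arbitrary choices:
from sklearn.model_selection import GridSearchCV
# Tune max_features by 5-fold CV on the training set, scored by ROC AUC.
param_grid = {'max_features': [2, 3, 5, 'sqrt', None]}
search = GridSearchCV(RandomForestClassifier(n_estimators=100, random_state=1234),
                      param_grid, scoring='roc_auc', cv=5)
search.fit(X_train, y_train)
print(search.best_params_, round(search.best_score_, 3))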
Performance comparison (ROC curve)
# ROC Curve
plt.figure()
plt.plot(fpr3, tpr3, label='Tree 3 (area = %0.2f)' % roc_auc3)
plt.plot(fpr4, tpr4, label='Tree 4 (area = %0.2f)' % roc_auc4)
plt.plot(fpr_bag, tpr_bag, label='Bagging (area = %0.2f)' % roc_auc_bag)
plt.plot(fpr_rf, tpr_rf, label='Random Forest (area = %0.2f)' % roc_auc_rf)
plt.plot(fpr_rf1, tpr_rf1, label='Random Forest 1 (area = %0.2f)' % roc_auc_rf1)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
Feature importances
# Get numerical feature importances
importances = rf1.feature_importances_
for name, importance in zip(Xname, importances):
    print(name, "=", importance)
indices = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)),X_train.columns[indices])
plt.xlabel('Relative Importance')
plt.show()
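Impurity-based importances like feature_importances_ can favor features with many split points; permutation importance on the test set is a common complement. A sketch using sklearn.inspection.permutation_importance (assuming scikit-learn >= 0.22; scoring by ROC AUC and 10 repeats are arbitrary choices):
from sklearn.inspection import permutation_importance
# Permutation importance: drop in test AUC when each column is shuffled.
perm = permutation_importance(rf1, X_test, y_test, scoring='roc_auc',
                              n_repeats=10, random_state=1234)
for name, imp in sorted(zip(Xname, perm.importances_mean), key=lambda t: -t[1]):
    print(name, "=", round(imp, 4))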