BSA08_Sklearn-ClassificationTree.ipynb
패키지 호출
from sklearn.datasets import load_iris
from sklearn import tree
# !pip install pydotplus
import pydotplus
from IPython.display import Image
## !pip install graphviz
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import graphviz
#from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split
데이터 불러오기
# Load the built-in iris dataset (a Bunch exposing .data, .target,
# .feature_names, and .target_names used by the cells below).
iris = load_iris()
분류 나무 생성
# Fit a decision-tree classifier on the full iris data, then export the fitted
# tree to DOT source and render it inline as a PNG via pydotplus/graphviz.
fitted_tree = tree.DecisionTreeClassifier().fit(iris.data, iris.target)
dot_source = tree.export_graphviz(
    fitted_tree,
    out_file=None,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
    rounded=True,
    special_characters=True,
)
tree_graph = pydotplus.graph_from_dot_data(dot_source)
# tree_graph.write_pdf("iris.pdf")  # uncomment to save the diagram as a PDF
Image(tree_graph.create_png())
학습데이터, 검증데이터 분리
# Reload the dataset, then hold out 30% of the samples for evaluation.
# The fixed seed makes the split (and the scores below) reproducible.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.3,
    random_state=100,
)
1. Bagging
# Ensemble 1: bagging — bootstrap-aggregated decision trees (sklearn defaults).
# Report the confusion matrix first, then the test-set accuracy.
bag_clf = BaggingClassifier().fit(X_train, y_train)
bag_pred = bag_clf.predict(X_test)
print('Bagging Confusion Matrix')
print(confusion_matrix(y_test, bag_pred))
print('Bagging Accuracy')
print(bag_clf.score(X_test, y_test))
2. Boosting
# Ensemble 2: gradient boosting — trees fit sequentially, each correcting the
# previous ones' errors. Same reporting pattern as the bagging cell.
gb_clf = GradientBoostingClassifier().fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
print('Boosting Confusion Matrix')
print(confusion_matrix(y_test, gb_pred))
print('Boosting Accuracy')
print(gb_clf.score(X_test, y_test))
3. Random Forest Classifier
# Ensemble 3: random forest with 1000 trees.
# NOTE: the name m_RF is kept — the feature-importance cell below reuses it.
m_RF = RandomForestClassifier(n_estimators=1000)
m_RF.fit(X_train, y_train)
rf_pred = m_RF.predict(X_test)
print('Random Forest Confusion Matrix')
print(confusion_matrix(y_test, rf_pred))
print('Random Forest Accuracy')
print(m_RF.score(X_test, y_test))
Random Forest의 feature importances
# Bar plot of the random forest's mean-decrease-in-impurity (MDI) feature
# importances, with error bars giving the spread across individual trees.
importances = m_RF.feature_importances_
# BUG FIX: the original comprehension ignored its loop variable and stacked
# len(estimators_) identical copies of the forest-level importances, so the
# std was identically zero. Use each tree's own feature_importances_ instead
# (this also avoids reusing the name `tree`, which is the imported module).
std = np.std([est.feature_importances_ for est in m_RF.estimators_], axis=0)
feature_labels = [f"feature {i}" for i in range(iris.data.shape[1])]
forest_importances = pd.Series(importances, index=feature_labels)
fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()
'Statistics > BSA' 카테고리의 다른 글
230529 / BSA13. k-means clustering (0) | 2023.05.29 |
---|---|
230524 / BSA12. 트리 앙상블 (Tree Ensemble) (0) | 2023.05.29 |
230522 / BSA12. pyspark에서 Logistic Regression (0) | 2023.05.24 |
230517 / BSA11. pyspark에서 통계 모델링 (0) | 2023.05.20 |
230517 / BSA11. python에서 통계 모델링 (0) | 2023.05.20 |