230529 / BSA13. k-means clustering

BSA08_Kmean-Cluster.ipynb

 

패키지 호출

import numpy as np
import pandas as pd

from sklearn.preprocessing import scale
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.mixture import GaussianMixture

 

데이터 불러오기

# Load the iris dataset and wrap its feature matrix in a DataFrame
# whose columns use short snake_case names.
iris = load_iris()
feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
iris_df = pd.DataFrame(iris.data, columns=feature_columns)
iris_df.head()

 

1. k-means clustering

# Partition the iris samples into three clusters with k-means.
# n_init is given explicitly: scikit-learn changed its default from 10 to
# "auto" and warns in the transition releases when it is left unspecified,
# so pinning it keeps the result stable across library versions.
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=300, random_state=316)

# Fit on the raw feature matrix instead of iris_df. This cell adds the
# "target" and "cluster" columns to iris_df below, so fitting on iris_df
# would silently cluster on those labels if the cell is ever re-executed.
kmeans.fit(iris.data)

# Show which cluster (0, 1, 2) each sample was assigned to.
print(kmeans.labels_)

iris_df["target"] = iris.target
iris_df["cluster"] = kmeans.labels_

# Cross-tabulate the true class against the assigned cluster to see how
# well the clusters line up with the known labels.
iris_result = iris_df.groupby(["target", "cluster"]).size()
print(iris_result)

 

군집화 평가 - 실루엣 분석(Silhouette analysis)

# Silhouette evaluation of the cluster assignment stored in iris_df["cluster"].
cluster_labels = iris_df["cluster"]

# Per-sample silhouette coefficient, kept next to each row.
iris_df["silhouette"] = silhouette_samples(iris.data, cluster_labels)

# Mean silhouette coefficient over all samples.
ave_score = silhouette_score(iris.data, cluster_labels)
print("SAS:{0:.4f}".format(ave_score))
iris_df.head()

# Mean silhouette coefficient within each cluster.
iris_df["silhouette"].groupby(iris_df["cluster"]).mean()

 

 

2. GMM (Gaussian Mixture Model)

# Fit a three-component Gaussian mixture on the raw iris features and
# record the predicted component for every sample.
gmm = GaussianMixture(n_components=3, random_state=316)
gmm.fit(iris.data)
iris_df["gmm"] = gmm.predict(iris.data)
iris_df.head()

# For each true class, count how many samples landed in each component.
iris_result = iris_df.groupby("target")["gmm"].value_counts()
print(iris_result)

 

3. 기타 군집법

  • 평균이동(mean shift): from sklearn.cluster import MeanShift
  • DBSCAN(density-based spatial clustering of applications with noise): from sklearn.cluster import DBSCAN