230508 / BSA10. python에서 스팸 메일 분류

BSA07_SMS-Spam-Analysis.ipynb

패키지 호출

import pandas as pd
import numpy as np
import nltk
import re  # regular expression operator(정규식)
from nltk.corpus import stopwords

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

불용어 다운로드

# NLTK ships its own stopword lists.
# The download only needs to be run once.
nltk.download('stopwords')

데이터 불러오기

import csv  # used only for the QUOTE_NONE constant below

# The file is tab-separated and has no header row, so the two columns come
# back named 0 and 1.
# read_table is the reader for tab/whitespace-delimited text; read_csv with its
# default separator could split on commas that occur inside a message.
# The messages are free text rather than CSV-quoted fields. With the default
# quoting, a message that starts with a double quote is parsed as a quoted
# field, so an unbalanced quote can merge neighbouring records. QUOTE_NONE
# makes quote characters ordinary text.
# NOTE(review): the UCI copy is reported to load as 5574 rows with QUOTE_NONE
# versus 5572 without it - confirm against your copy of the file.
df = pd.read_table("SMSSpamCollection", header=None, sep="\t", quoting=csv.QUOTE_NONE)
df.head()

데이터 전처리

# Give the two unnamed columns descriptive names.
column_names = {0: 'label', 1: 'messages'}
df = df.rename(columns=column_names)

# Count the missing values in each column.
df.isna().sum()

불용어 제거

# English stopwords from NLTK, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words('english'))

# Matches every character that is not an ASCII digit or a lowercase ASCII
# letter. The text is lowercased before matching, so no A-Z range is needed.
_NON_ALNUM = re.compile(r'[^0-9a-z]')


def 불용어제거(텍스트, 불용어=None):
    """Normalise one message and remove its stopwords.

    The text is lowercased, every character other than ``0-9`` / ``a-z`` is
    replaced by a space, the result is split on whitespace, and the words
    that are not stopwords are joined back with single spaces.

    Args:
        텍스트: The raw message. Anything that is not a ``str`` (for example
            ``None`` or a NaN from a missing cell) is treated as having no
            text.
        불용어: Optional collection of words to drop. When omitted, the
            module-level ``STOPWORDS`` set is used.

    Returns:
        The cleaned message as a single space-separated string; ``""`` when
        nothing is left or the input is not a string.
    """
    # A missing value has no text to clean; return an empty document instead
    # of failing on .lower().
    if not isinstance(텍스트, str):
        return ""
    # Compare against None so that an explicitly empty collection is honoured.
    if 불용어 is None:
        불용어 = STOPWORDS
    정리된텍스트 = _NON_ALNUM.sub(' ', 텍스트.lower())
    # split() with no argument already discards runs of spaces, so no separate
    # whitespace-collapsing pass is required.
    return " ".join(단어 for 단어 in 정리된텍스트.split() if 단어 not in 불용어)


# Clean every raw message with 불용어제거 and keep the result in a new
# 'processed' column.
df['processed'] = df['messages'].map(불용어제거)
df.head()

모델 적용

# Split the frame into model input and target:
#   X - cleaned message text (explanatory variable)
#   y - message label (response variable)
X, y = df['processed'], df['label']


def SMS분류(model, X, y, *, test_size=0.25, random_state=316, cv=5):
    """Train and evaluate a bag-of-words / TF-IDF pipeline around ``model``.

    Prints, in order: the hold-out accuracy in percent, the classification
    report for the hold-out set, and the mean cross-validation score in
    percent.

    Args:
        model: The estimator used as the final pipeline step.
        X: The text documents.
        y: The labels, also used to stratify the split.
        test_size: Fraction of the data held out for testing (default 0.25,
            i.e. 75% train / 25% test).
        random_state: Seed for the shuffled split.
        cv: Number of cross-validation folds.

    Returns:
        None. The results are printed.
    """
    # Imported locally so the function does not depend on a file-level import
    # of accuracy_score.
    from sklearn.metrics import accuracy_score

    # stratify=y keeps the spam/ham proportions the same in the train and test
    # parts.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state,
        shuffle=True, stratify=y)

    # CountVectorizer output feeds TfidfTransformer, whose output feeds the
    # classifier.
    pipeline_model = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer()),
                               ('clf', model)])
    pipeline_model.fit(x_train, y_train)

    # Predict the hold-out set once and reuse the predictions for both the
    # accuracy and the report. score() would run the same prediction again.
    y_pred = pipeline_model.predict(x_test)
    print('Accuracy:', accuracy_score(y_test, y_pred)*100)
    print(classification_report(y_test, y_pred))

    # Cross-validation: every sample is used for both training and testing
    # across the folds.
    cv_score = cross_val_score(pipeline_model, X, y, cv=cv)
    print("CV Score:", np.mean(cv_score)*100)

# Evaluate four classifiers, one after another, on the same data:
#   1. Logistic regression
#   2. Naive Bayes
#   3. Support Vector Machine (C=3 instead of the default C=1)
#   4. Random Forest
for model in (
    LogisticRegression(),
    MultinomialNB(),
    SVC(C=3),
    RandomForestClassifier(),
):
    SMS분류(model, X, y)

모델 성능 평가

def SMS예측(model, x_train, y_train, x_test):
    """Fit the text pipeline on the training data and predict the test labels.

    The pipeline chains CountVectorizer, TfidfTransformer and ``model``.
    Returns the predictions for ``x_test``.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', model),
    ]
    fitted_pipeline = Pipeline(steps).fit(x_train, y_train)
    return fitted_pipeline.predict(x_test)
    
    
# Stratified 75/25 hold-out split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=316,
    shuffle=True,
    stratify=y,
)

# Fit an SVM with C=3 and predict the labels of the test messages.
model = SVC(C=3)
예측label = SMS예측(model, x_train, y_train, x_test)

# Show the container types and the first ten true and predicted labels.
print(type(y_test), type(예측label))
print(list(y_test[:10]))
print(예측label[:10])

# Confusion matrix of true labels against predicted labels.
혼동행렬 = confusion_matrix(y_test, 예측label)
print(혼동행렬)

# Precision, recall and F1, with "ham" treated as the positive class.
for metric_label, metric_fn in (
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1:", f1_score),
):
    print(metric_label, metric_fn(y_test, 예측label, pos_label="ham"))

저작자표시 (새창열림)

'Statistics > BSA' 카테고리의 다른 글

230508 / BSA10. pyspark에서 스팸 메일 분류 (0)	2023.05.14
230508 / BSA10. pyspark의 Natural Language Processing (0)	2023.05.14
230503 / BSA10. 형태소 분석, 워드 클라우드 (0)	2023.05.09
230501 / BSA09. pyspark에서 탐색적 데이터 분석 (EDA) (0)	2023.05.03
230501 / BSA09. pandas에서 탐색적 데이터 분석 (EDA) (0)	2023.05.03

'Statistics > BSA' 카테고리의 다른 글

티스토리툴바