BSA08-Regress-Whitewine.ipynb 패키지 호출 import numpy as np import pandas as pd import matplotlib.pyplot as plt from sklearn.linear_model import LinearRegression from sklearn.linear_model import Lasso, Ridge from sklearn.metrics import mean_squared_error from sklearn.metrics import confusion_matrix, accuracy_score from sklearn.linear_model import LogisticRegression from sklearn import datasets impor..
BSA07_Python_SMS-Spam.ipynb 1. CountVectorizer 예제문장 = ['This is the first document.', 'This document is the second document.', 'And this is the third one.', 'Is this the first document?'] # 단어 빈도 계산 vectorizer = CountVectorizer() 토큰개수 = vectorizer.fit_transform(예제문장) vectorizer.get_feature_names_out() # 출력결과 # array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this'], dty..
BSA07_ NLP-1.ipynb 패키지 호출 import re import nltk from nltk.tokenize import word_tokenize from nltk.tokenize import sent_tokenize from nltk.tokenize import RegexpTokenizer from nltk import ngrams 토큰화 빅통분 = """이 강의에서는 Windows 10, 11 하에서 Apache에서 제공하는 다양한 빅데이터처리 툴(Hadoop, Spark, Kafka 등)을 설치하고 환경 설정하는 작업을 직접 수행하기 때문에 컴퓨터에 대한 사전 지식이 없는 경우 수강하기 어려움. 빅데이터의 특징인 3V와 같이 이 교과에서는 상당한 양과 다양한 내용을 빠르게 강의하기 때문에..
BSA07_Pyspark-SMS-Spam-Analysis.ipynb 패키지 호출 from pyspark.sql import SparkSession from pyspark.ml.feature import Tokenizer, RegexTokenizer from pyspark.ml.feature import CountVectorizer from pyspark.sql.functions import col, udf from pyspark.sql.types import IntegerType from pyspark.ml.feature import StopWordsRemover from pyspark.ml.feature import IDF from pyspark.sql.functions import length fro..
BSA07_Pyspark-NLP.ipynb 패키지 호출 및 스파크 세션 시작 from pyspark.sql import SparkSession from pyspark.ml.feature import Tokenizer, RegexTokenizer # RegexTokenizer : 데이터를 쪼개는 작업을 위한 것 from pyspark.ml.feature import StopWordsRemover from pyspark.ml.feature import NGram from pyspark.ml.feature import HashingTF, IDF, Tokenizer from pyspark.ml.feature import CountVectorizer spark = SparkSession.builder.appNam..
BSA07_SMS-Spam-Analysis.ipynb 패키지 호출 import pandas as pd import numpy as np import nltk import re # regular expression operator(정규식) from nltk.corpus import stopwords from sklearn.pipeline import Pipeline from sklearn.model_selection import train_test_split, cross_val_score from sklearn.metrics import classification_report from sklearn.feature_extraction.text import CountVectorizer, TfidfVectori..