[16-1] Spam/Ham Classification Exercise
!pip install numpy
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import numpy as np
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
"""
실행시 주의사항: SMSSpamCollection의 경로를 지정해주세요.
"""
smsdata = open('SMSSpamCollection',encoding='utf8')
def preprocessing(text):
    # Sentence-split, then word-tokenize.
    tokens = [word for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    # Lowercase first so capitalized stop words (e.g. 'The') are also caught.
    tokens = [token.lower() for token in tokens]
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # Drop short tokens (punctuation, one/two-letter words).
    tokens = [word for word in tokens if len(word) >= 3]
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
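# Quick sanity check of the pipeline above; the sample sentence is
# illustrative, not taken from the dataset.
print(preprocessing("Congratulations! You have WON a free ticket."))
# e.g. 'congratulation won free ticket'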
sms_data = []
sms_labels = []
sms_csv_reader = csv.reader(smsdata, delimiter='\t')
for line in sms_csv_reader:
    sms_labels.append(line[0])               # first column: 'ham' / 'spam'
    sms_data.append(preprocessing(line[1]))  # second column: message text
smsdata.close()
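# Optional sanity check on the loaded corpus: class balance.
# (Counter comes from the standard library; this block is illustrative.)
from collections import Counter
print(Counter(sms_labels))   # expect far more 'ham' than 'spam'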
trainset_size = int(round(len(sms_data) * 0.70))
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
# 70/30 train/test split on the preprocessed messages and their labels.
x_train = np.array(sms_data[:trainset_size])
y_train = np.array(sms_labels[:trainset_size])
x_test = np.array(sms_data[trainset_size:])
y_test = np.array(sms_labels[trainset_size:])
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams; min_df=2 drops terms that occur in
# only one document.
vectorizer2 = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english',
                              strip_accents='unicode', norm='l2')
X_train = vectorizer2.fit_transform(x_train)
X_test = vectorizer2.transform(x_test)   # fit on train only, then transform test
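# A quick look at the vectorized data (illustrative check):
print(X_train.shape)                  # (n_train_docs, n_features), sparse matrix
print(len(vectorizer2.vocabulary_))   # vocabulary size after min_df pruning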
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB().fit(X_train, y_train)
y_predicted_NB = clf_NB.predict(X_test)
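# Classifying a new, made-up message with the trained NB model
# (the message below is illustrative, not from the dataset):
new_msg = preprocessing("WIN a brand new phone, text YES to claim your prize")
print(clf_NB.predict(vectorizer2.transform([new_msg])))   # likely ['spam']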
from sklearn import tree
# .toarray() densifies the sparse TF-IDF matrix for the tree; this can be
# memory-heavy when the vocabulary is large.
clf_DT = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_predicted_DT = clf_DT.predict(X_test.toarray())
from sklearn.linear_model import SGDClassifier
# Default loss is 'hinge', so this is a linear SVM trained by SGD.
clf_SGD = SGDClassifier(alpha=.0001).fit(X_train, y_train)
y_predicted_SGD = clf_SGD.predict(X_test)
from sklearn.svm import LinearSVC
clf_SVM = LinearSVC().fit(X_train, y_train)
y_predicted_SVM = clf_SVM.predict(X_test)
from sklearn.ensemble import RandomForestClassifier
clf_RFA = RandomForestClassifier(n_estimators=10)  # a deliberately small forest
clf_RFA.fit(X_train, y_train)
y_predicted_RFA = clf_RFA.predict(X_test)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print('\nconfusion_matrix NB\n')
cm = confusion_matrix(y_test, y_predicted_NB)
print(cm)
print('\nHere is the classification report:')
print(classification_report(y_test, y_predicted_NB))
print('\nconfusion_matrix DT\n')
cm = confusion_matrix(y_test, y_predicted_DT)
print(cm)
print('\nHere is the classification report:')
print(classification_report(y_test, y_predicted_DT))
print('\nconfusion_matrix SGD\n')
cm = confusion_matrix(y_test, y_predicted_SGD)
print(cm)
print('\nHere is the classification report:')
print(classification_report(y_test, y_predicted_SGD))
print('\nconfusion_matrix SVM\n')
cm = confusion_matrix(y_test, y_predicted_SVM)
print(cm)
print('\nHere is the classification report:')
print(classification_report(y_test, y_predicted_SVM))
print('\nconfusion_matrix RFA\n')
cm = confusion_matrix(y_test, y_predicted_RFA)
print(cm)
print('\nHere is the classification report:')
print(classification_report(y_test, y_predicted_RFA))
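# The five report blocks above repeat one pattern; an equivalent compact
# form (a sketch using only the variables defined above):
for name, y_pred in [('NB', y_predicted_NB), ('DT', y_predicted_DT),
                     ('SGD', y_predicted_SGD), ('SVM', y_predicted_SVM),
                     ('RFA', y_predicted_RFA)]:
    print('\nconfusion_matrix', name)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))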
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Unzipping corpora/wordnet.zip.
The training set size for this classifier is 3900
confusion_matrix NB

[[1443    0]
 [  52  176]]

Here is the classification report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1443
        spam       1.00      0.77      0.87       228

    accuracy                           0.97      1671
   macro avg       0.98      0.89      0.93      1671
weighted avg       0.97      0.97      0.97      1671

confusion_matrix DT

[[1411   32]
 [  39  189]]

Here is the classification report:
              precision    recall  f1-score   support

         ham       0.97      0.98      0.98      1443
        spam       0.86      0.83      0.84       228

    accuracy                           0.96      1671
   macro avg       0.91      0.90      0.91      1671
weighted avg       0.96      0.96      0.96      1671

confusion_matrix SGD

[[1436    7]
 [  22  206]]

Here is the classification report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1443
        spam       0.97      0.90      0.93       228

    accuracy                           0.98      1671
   macro avg       0.98      0.95      0.96      1671
weighted avg       0.98      0.98      0.98      1671

confusion_matrix SVM

[[1437    6]
 [  24  204]]

Here is the classification report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1443
        spam       0.97      0.89      0.93       228

    accuracy                           0.98      1671
   macro avg       0.98      0.95      0.96      1671
weighted avg       0.98      0.98      0.98      1671

confusion_matrix RFA

[[1441    2]
 [  54  174]]

Here is the classification report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1443
        spam       0.99      0.76      0.86       228

    accuracy                           0.97      1671
   macro avg       0.98      0.88      0.92      1671
weighted avg       0.97      0.97      0.96      1671