Text Classification

Jane's study note · November 30, 2022

Series: NLP Natural Language Processing (10/24)

[16-1] Spam/Ham Classification Practice

!pip install np  # environment setup

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import np #pip install np

# environment setup: download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

"""
실행시 주의사항: SMSSpamCollection의 경로를 지정해주세요.
"""

smsdata = open('SMSSpamCollection',encoding='utf8') #PATH SETTING


def preprocessing(text):   #Preprocessing
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 
   
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    
    # lowercase (done after stopword removal, so capitalized stopwords such as "The" survive)
    tokens = [word.lower() for word in tokens]
    
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    preprocessed_text= ' '.join(tokens)
    return preprocessed_text
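
As a quick sanity check, the helper can be run on a made-up message (the example string below is hypothetical, not taken from the dataset, and the exact output may vary slightly with the installed NLTK data):

sample = "WINNER!! You have been selected to receive a free prize. Call now!"
print(preprocessing(sample))
# -> roughly: 'winner you selected receive free prize call'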



sms_data = []
sms_labels = []
cnt = 0
sencsv_reader = csv.reader(smsdata,delimiter='\t')
for line in sencsv_reader:
    # line[0] is the label ('ham' or 'spam'), line[1] is the message text
    sms_labels.append(line[0])
    sms_data.append(preprocessing(line[1]))

smsdata.close()


trainset_size = int(round(len(sms_data)*0.70))  # Split into train and test data (70/30)
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]])
y_train = np.array([el for el in sms_labels[0:trainset_size]])
# note: slicing from trainset_size+1 skips one sample; trainset_size alone would use every example
x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]])
y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_labels)]])
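
As an aside, an equivalent (and class-balanced) split could be produced with scikit-learn's train_test_split; this is only an alternative sketch and is not used by the rest of the notebook:

from sklearn.model_selection import train_test_split

# alternative 70/30 split, stratified on the label so the ham/spam ratio matches in both sets
x_tr, x_te, y_tr, y_te = train_test_split(
    sms_data, sms_labels, test_size=0.30, random_state=42, stratify=sms_labels)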

# TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer2 = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2')
X_train = vectorizer2.fit_transform(x_train)
X_test = vectorizer2.transform(x_test)
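
To see what the vectorizer actually produced, a small optional check of the resulting matrices (rows = messages, columns = unigram/bigram features kept by min_df=2):

print(X_train.shape, X_test.shape)   # sparse TF-IDF matrices
print(len(vectorizer2.vocabulary_))  # number of learned features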

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB().fit(X_train, y_train)
y_predicted_NB = clf_NB.predict(X_test)

# Decision tree
from sklearn import tree
clf_DT = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_predicted_DT = clf_DT.predict(X_test.toarray())

# Stochastic gradient descent
from sklearn.linear_model import SGDClassifier
#clf_SGD = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train)  # n_iter was removed in newer scikit-learn; use max_iter instead
clf_SGD = SGDClassifier(alpha=.0001).fit(X_train, y_train)
y_predicted_SGD = clf_SGD.predict(X_test)

# Support Vector Machines
from sklearn.svm import LinearSVC
clf_SVM = LinearSVC().fit(X_train, y_train)
y_predicted_SVM = clf_SVM.predict(X_test)

# The Random forest algorithm
from sklearn.ensemble import RandomForestClassifier
clf_RFA = RandomForestClassifier(n_estimators=10)
clf_RFA.fit(X_train, y_train)
y_predicted_RFA = clf_RFA.predict(X_test)




from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print (' \n confusion_matrix NB \n ')
cm = confusion_matrix(y_test, y_predicted_NB)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_NB))


print (' \n confusion_matrix DT \n ')
cm = confusion_matrix(y_test, y_predicted_DT)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_DT))

print (' \n confusion_matrix SGD \n ')
cm = confusion_matrix(y_test, y_predicted_SGD)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_SGD))

print (' \n confusion_matrix SVM\n ')
cm = confusion_matrix(y_test, y_predicted_SVM)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_SVM))

print (' \n confusion_matrix RFA \n ')
cm = confusion_matrix(y_test, y_predicted_RFA)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_RFA))
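
Since the five evaluation blocks above differ only in the prediction array, they could also be written as a single loop; a compact sketch that prints the same confusion matrices and reports plus an accuracy summary:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

predictions = {
    'NB': y_predicted_NB,
    'DT': y_predicted_DT,
    'SGD': y_predicted_SGD,
    'SVM': y_predicted_SVM,
    'RFA': y_predicted_RFA,
}
for name, y_pred in predictions.items():
    print('\n confusion_matrix ' + name + '\n')
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('accuracy:', round(accuracy_score(y_test, y_pred), 4))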

Requirement already satisfied: np in /usr/local/lib/python3.6/dist-packages (1.0.2)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
The training set size for this classifier is 3900

 
 confusion_matrix NB 
 
[[1443    0]
 [  52  176]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1443
        spam       1.00      0.77      0.87       228

    accuracy                           0.97      1671
   macro avg       0.98      0.89      0.93      1671
weighted avg       0.97      0.97      0.97      1671

 
 confusion_matrix DT 
 
[[1411   32]
 [  39  189]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.97      0.98      0.98      1443
        spam       0.86      0.83      0.84       228

    accuracy                           0.96      1671
   macro avg       0.91      0.90      0.91      1671
weighted avg       0.96      0.96      0.96      1671

 
 confusion_matrix SGD 
 
[[1436    7]
 [  22  206]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1443
        spam       0.97      0.90      0.93       228

    accuracy                           0.98      1671
   macro avg       0.98      0.95      0.96      1671
weighted avg       0.98      0.98      0.98      1671

 
 confusion_matrix SVM
 
[[1437    6]
 [  24  204]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1443
        spam       0.97      0.89      0.93       228

    accuracy                           0.98      1671
   macro avg       0.98      0.95      0.96      1671
weighted avg       0.98      0.98      0.98      1671

 
 confusion_matrix RFA 
 
[[1441    2]
 [  54  174]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1443
        spam       0.99      0.76      0.86       228

    accuracy                           0.97      1671
   macro avg       0.98      0.88      0.92      1671
weighted avg       0.97      0.97      0.96      1671
