Text Classification

Jane's study note · November 30, 2022

Series: NLP Natural Language Processing (10/24)

[16-1] Spam/Ham Classification Practice

!pip install np  # environment setup

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
import np #pip install np

# environment setup: download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

"""
실행시 주의사항: SMSSpamCollection의 경로를 지정해주세요.
"""

smsdata = open('SMSSpamCollection',encoding='utf8') #PATH SETTING


def preprocessing(text):   #Preprocessing
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)] 
   
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    
    # lowercase (done after stopword removal, so capitalized stopwords such as "The" survive)
    tokens = [word.lower() for word in tokens]
    
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]

    preprocessed_text= ' '.join(tokens)
    return preprocessed_text
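
As a quick sanity check, the helper can be run on a made-up message (the example string below is hypothetical, not taken from the dataset, and the exact output may vary slightly with the installed NLTK data):

sample = "WINNER!! You have been selected to receive a free prize. Call now!"
print(preprocessing(sample))
# -> roughly: 'winner you selected receive free prize call'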



sms_data = []
sms_labels = []
cnt = 0
sencsv_reader = csv.reader(smsdata,delimiter='\t')
for line in sencsv_reader:
    # line[0] is the label ('ham' or 'spam'), line[1] is the message text
    sms_labels.append(line[0])
    sms_data.append(preprocessing(line[1]))

smsdata.close()


trainset_size = int(round(len(sms_data)*0.70))  # Split into train and test data (70/30)
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
x_train = np.array([''.join(el) for el in sms_data[0:trainset_size]])
y_train = np.array([el for el in sms_labels[0:trainset_size]])
# note: slicing from trainset_size+1 skips one sample; trainset_size alone would use every example
x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]])
y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_labels)]])
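
As an aside, an equivalent (and class-balanced) split could be produced with scikit-learn's train_test_split; this is only an alternative sketch and is not used by the rest of the notebook:

from sklearn.model_selection import train_test_split

# alternative 70/30 split, stratified on the label so the ham/spam ratio matches in both sets
x_tr, x_te, y_tr, y_te = train_test_split(
    sms_data, sms_labels, test_size=0.30, random_state=42, stratify=sms_labels)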

# TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer2 = TfidfVectorizer(min_df=2, ngram_range=(1, 2), stop_words='english', strip_accents='unicode', norm='l2')
X_train = vectorizer2.fit_transform(x_train)
X_test = vectorizer2.transform(x_test)
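
To see what the vectorizer actually produced, a small optional check of the resulting matrices (rows = messages, columns = unigram/bigram features kept by min_df=2):

print(X_train.shape, X_test.shape)   # sparse TF-IDF matrices
print(len(vectorizer2.vocabulary_))  # number of learned features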

# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
clf_NB = MultinomialNB().fit(X_train, y_train)
y_predicted_NB = clf_NB.predict(X_test)

# Decision tree
from sklearn import tree
clf_DT = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_predicted_DT = clf_DT.predict(X_test.toarray())

# Stochastic gradient descent
from sklearn.linear_model import SGDClassifier
#clf_SGD = SGDClassifier(alpha=.0001, n_iter=50).fit(X_train, y_train)  # n_iter was removed in newer scikit-learn; use max_iter instead
clf_SGD = SGDClassifier(alpha=.0001).fit(X_train, y_train)
y_predicted_SGD = clf_SGD.predict(X_test)

# Support Vector Machines
from sklearn.svm import LinearSVC
clf_SVM = LinearSVC().fit(X_train, y_train)
y_predicted_SVM = clf_SVM.predict(X_test)

# The Random forest algorithm
from sklearn.ensemble import RandomForestClassifier
clf_RFA = RandomForestClassifier(n_estimators=10)
clf_RFA.fit(X_train, y_train)
y_predicted_RFA = clf_RFA.predict(X_test)




from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
print (' \n confusion_matrix NB \n ')
cm = confusion_matrix(y_test, y_predicted_NB)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_NB))


print (' \n confusion_matrix DT \n ')
cm = confusion_matrix(y_test, y_predicted_DT)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_DT))

print (' \n confusion_matrix SGD \n ')
cm = confusion_matrix(y_test, y_predicted_SGD)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_SGD))

print (' \n confusion_matrix SVM\n ')
cm = confusion_matrix(y_test, y_predicted_SVM)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_SVM))

print (' \n confusion_matrix RFA \n ')
cm = confusion_matrix(y_test, y_predicted_RFA)
print (cm)
print ('\n Here is the classification report:')
print (classification_report(y_test, y_predicted_RFA))
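
Since the five evaluation blocks above differ only in the prediction array, they could also be written as a single loop; a compact sketch that prints the same confusion matrices and reports plus an accuracy summary:

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

predictions = {
    'NB': y_predicted_NB,
    'DT': y_predicted_DT,
    'SGD': y_predicted_SGD,
    'SVM': y_predicted_SVM,
    'RFA': y_predicted_RFA,
}
for name, y_pred in predictions.items():
    print('\n confusion_matrix ' + name + '\n')
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('accuracy:', round(accuracy_score(y_test, y_pred), 4))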

Requirement already satisfied: np in /usr/local/lib/python3.6/dist-packages (1.0.2)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
The training set size for this classifier is 3900

 
 confusion_matrix NB 
 
[[1443    0]
 [  52  176]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1443
        spam       1.00      0.77      0.87       228

    accuracy                           0.97      1671
   macro avg       0.98      0.89      0.93      1671
weighted avg       0.97      0.97      0.97      1671

 
 confusion_matrix DT 
 
[[1411   32]
 [  39  189]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.97      0.98      0.98      1443
        spam       0.86      0.83      0.84       228

    accuracy                           0.96      1671
   macro avg       0.91      0.90      0.91      1671
weighted avg       0.96      0.96      0.96      1671

 
 confusion_matrix SGD 
 
[[1436    7]
 [  22  206]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1443
        spam       0.97      0.90      0.93       228

    accuracy                           0.98      1671
   macro avg       0.98      0.95      0.96      1671
weighted avg       0.98      0.98      0.98      1671

 
 confusion_matrix SVM
 
[[1437    6]
 [  24  204]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1443
        spam       0.97      0.89      0.93       228

    accuracy                           0.98      1671
   macro avg       0.98      0.95      0.96      1671
weighted avg       0.98      0.98      0.98      1671

 
 confusion_matrix RFA 
 
[[1441    2]
 [  54  174]]

 Here is the classification report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1443
        spam       0.99      0.76      0.86       228

    accuracy                           0.97      1671
   macro avg       0.98      0.88      0.92      1671
weighted avg       0.97      0.97      0.96      1671
