머신러닝 24일차

ParkJinYoung·2022년 11월 18일

Embedding

# Invert the Reuters vocabulary (word -> id) so the integer ids stored in
# X_train can be looked up as words.
# NOTE(review): reuters.load_data shifts word ids by `index_from` (default 3)
# for its reserved tokens; confirm whether lookups need that offset.
word_index = reuters.get_word_index()
index_to_word = {index: word for word, index in word_index.items()}
    
# One-hot encode the labels
from tensorflow.keras.utils import to_categorical

# Pin the encoding width to the 46 units of the models' softmax output layer.
# Without num_classes, each call infers the width from its own largest label,
# so the train and test encodings can differ if a split lacks the top class id.
num_classes = 46
y_train_en = to_categorical(y_train, num_classes=num_classes)
y_test_en = to_categorical(y_test, num_classes=num_classes)

# Add a trailing feature axis so every sample has shape (145, 1), which is the
# input_shape expected by the RNN-only model.
# -1 lets the array infer the sample count instead of hard-coding the sizes of
# the train/test splits, so this keeps working if the split sizes change.
X_train_seq = X_train_seq.reshape(-1, 145, 1)
X_test_seq = X_test_seq.reshape(-1, 145, 1)

# Deep learning model design
# Recurrent layer only (no embedding): each timestep is a single raw word id.
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense , SimpleRNN, Embedding

model1 = Sequential([
    SimpleRNN(units=32, input_shape=(145, 1), activation='tanh'),
    Dense(units=64, activation='relu'),
    # One output unit per class, matching the one-hot encoded labels.
    Dense(units=46, activation='softmax'),
])
model1.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model1.fit(X_train_seq, y_train_en, epochs=50)

from tensorflow.keras.preprocessing import sequence

# Reload the Reuters data, keeping only the `max_feature` most frequent words.
(X_train, y_train), (X_test, y_test) = reuters.load_data(num_words=max_feature)

# Pad/truncate every article to the same length so the splits become
# rectangular arrays.
max_len = 145
X_train_seq, X_test_seq = (
    sequence.pad_sequences(split, maxlen=max_len)
    for split in (X_train, X_test)
)

# RNN + Embedding
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense , SimpleRNN, Embedding

model2 = Sequential([
    # Embedding(vocabulary size, vector length used to represent each word).
    # NOTE(review): 2000 must cover every word id produced by
    # reuters.load_data(num_words=max_feature) -- confirm max_feature <= 2000.
    Embedding(2000, 100),
    SimpleRNN(units=32, activation='tanh'),
    Dense(units=64, activation='relu'),
    Dense(units=46, activation='softmax'),
])

model2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model2.fit(X_train_seq, y_train_en, epochs=20)

# LSTM + Embedding
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense , LSTM, Embedding

model3 = Sequential([
    # Embedding(vocabulary size, vector length used to represent each word).
    Embedding(2000, 100),
    LSTM(units=32, activation='tanh'),
    Dense(units=64, activation='relu'),
    Dense(units=46, activation='softmax'),
])

model3.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model3.fit(X_train_seq, y_train_en, epochs=20)

# LSTM + Embedding + Conv
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense , LSTM, Embedding,Conv1D, MaxPooling1D

model4 = Sequential([
    # Embedding(vocabulary size, vector length used to represent each word).
    Embedding(2000, 100),
    # Convolution + pooling over the embedded sequence before the LSTM.
    Conv1D(filters=32, kernel_size=5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(units=32, activation='tanh'),
    Dense(units=64, activation='relu'),
    Dense(units=46, activation='softmax'),
])

model4.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model4.fit(X_train_seq, y_train_en, epochs=20)

텍스트 생성

# Toy corpus of three Korean sentences used to train the text generator.
# Each of the first two lines ends with an explicit \n escape followed by a
# real line break, so the sentences are separated by an empty line.
text="""경마장에 있는 말이 뛰고 있다\n
그의 말이 법이다\n
가는 말이 고와야 오는 말이 곱다"""

# Split the text into words and assign an integer id to each one
from tensorflow.keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts([text])
# Bare expressions with no effect when run as a script; presumably left in to
# inspect the fitted tokenizer interactively (notebook cell output).
t.word_docs
t.word_index

# Build the training data: for every line, each prefix of two or more words
# becomes one sample (the last word of the prefix is later used as the target).
sequences = []
for line in text.split('\n'):
  # Convert the words of this line to their integer ids
  encoded = t.texts_to_sequences([line])[0]

  # Collect every prefix of length >= 2. The prefixes are added directly to the
  # list instead of going through a variable named `sequence`, which would
  # overwrite the `sequence` module imported from tensorflow.keras.preprocessing
  # earlier in this file. Empty lines encode to [] and contribute nothing.
  sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))
sequences

import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Pad to the length of the longest sample instead of a hard-coded 6, so the
# value stays correct if the corpus changes (it is 6 for the current text).
max_len = max(len(sample) for sample in sequences)

sequences = pad_sequences(sequences,maxlen = max_len)
sequences

sequences = np.array(sequences)
# Every column but the last is the model input; the last column is the
# next-word target.
X = sequences[:,:-1]
y = sequences[:,-1]

# One-hot encode the targets
from tensorflow.keras.utils import to_categorical

# Number of classes = vocabulary size + 1: the tokenizer's word ids start at 1
# and id 0 is the padding value. Derived from the tokenizer rather than
# hard-coded (it is 12 for the current text).
vocab_size = len(t.word_index) + 1
y_en = to_categorical(y, num_classes=vocab_size)
y_en

from tensorflow.keras.layers import Embedding, Dense, SimpleRNN
from tensorflow.keras import Sequential

# Size the model from the data instead of hard-coding 12 and 5:
# vocabulary size + 1 (id 0 is the padding value) and the width of X.
vocab_size = len(t.word_index) + 1
input_length = X.shape[1]

model1 = Sequential()

# Each word id is mapped to a 10-dimensional vector.
model1.add(Embedding(vocab_size, 10, input_length=input_length))
model1.add(SimpleRNN(32))
# One output unit per possible next-word id.
model1.add(Dense(units=vocab_size, activation='softmax'))
model1.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']

)
model1.fit(X,y_en,epochs = 200)

# Text generation using the trained model
def make_sentence(model, t, current_word, n):
  """Generate text by repeatedly predicting the next word.

  Relies on the module-level ``pad_sequences`` function and ``max_len`` value
  (inputs are padded to ``max_len - 1`` ids).

  Args:
    model: Trained model whose ``predict`` returns one score per word id for
      a batch of padded id sequences.
    t: Fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
    current_word: Seed text to start from.
    n: Maximum number of words to generate.

  Returns:
    The seed text followed by the generated words, separated by single
    spaces. Generation stops early if the model predicts an id that has no
    word in the tokenizer's vocabulary (for example the padding id 0).
  """
  init_word = current_word  # kept so the seed can be prepended to the result

  # Build the id -> word table once instead of scanning word_index per step.
  index_to_word = {index: word for word, index in t.word_index.items()}

  generated = []
  for _ in range(n):
    encoded = t.texts_to_sequences([current_word])[0]
    print(encoded)  # show the ids fed to the model
    print(current_word)
    # Pad the input to the fixed length the model was trained on
    encoded = pad_sequences([encoded], maxlen=max_len - 1)
    # Predict the id of the next word (batch of one, so take element 0)
    predicted = model.predict(encoded, verbose=0).argmax(axis=-1)
    word = index_to_word.get(int(predicted[0]))
    if word is None:
      # No vocabulary entry for this id: stop rather than append a wrong word.
      break

    # Feed the extended text back in as the next input
    current_word = current_word + " " + word
    generated.append(word)

  # Assemble the final sentence
  return " ".join([init_word, *generated])
  
# Generate a sentence of the requested number of words from an arbitrary seed word
print(make_sentence(model=model1, t=t, current_word="경마장에", n=4))

ParkJinYoung

꾸준히

이전 포스트

머신러닝 24일차

Embedding

텍스트 생성

머신러닝 23일차

0개의 댓글