๐Ÿ˜„ [Day 18]: Assignment - EX04 AI Lyricist

๋ฐฑ๊ฑด · January 21, 2022

๋ณธ ๊ธ€์€ Hierachical Structure์˜ ๊ธ€์“ฐ๊ธฐ ๋ฐฉ์‹์œผ๋กœ, ๊ธ€์˜ ์ „์ฒด์ ์ธ ๋งฅ๋ฝ์„ ํŒŒ์•…ํ•˜๊ธฐ ์‰ฝ๋„๋ก ์ž‘์„ฑ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.
๋˜ํ•œ ๋ณธ ๊ธ€์€ CSF(Curation Service for Facilitation)๋กœ ์ธ์šฉ๋œ(์ฐธ์กฐ๋œ) ๋ชจ๋“  ์ถœ์ฒ˜๋Š” ์ƒ๋žตํ•ฉ๋‹ˆ๋‹ค.

AI Lyricist

๋ฐ์ดํ„ฐ ์ฝ์–ด์˜ค๊ธฐ

import numpy as np
import tensorflow as tf
import glob

txt_file_path = './lyricist/data/lyrics/*'
# glob pattern for the lyrics files

txt_list = glob.glob(txt_file_path)
# https://wikidocs.net/3746
# Collect every entry matching the pattern into txt_list.
# (Note: a single '*' only matches this directory; to search subfolders
# recursively, use a '**' pattern with glob.glob(..., recursive=True).)

raw_corpus = []

# ์—ฌ๋Ÿฌ๊ฐœ์˜ txt ํŒŒ์ผ์„ ๋ชจ๋‘ ์ฝ์–ด์„œ raw_corpus ์— ๋‹ด์Šต๋‹ˆ๋‹ค.
for txt_file in txt_list:
    with open(txt_file, "r") as f:
        raw = f.read().splitlines()
        raw_corpus.extend(raw)

print("๋ฐ์ดํ„ฐ ํฌ๊ธฐ:", len(raw_corpus))
print("Examples:\n", raw_corpus[:3])
๋ฐ์ดํ„ฐ ํฌ๊ธฐ: 187088
Examples:
 ['The first words that come out', 'And I can see this song will be about you', "I can't believe that I can breathe without you"]
# ์ž…๋ ฅ๋œ ๋ฌธ์žฅ์„
#     1. ์†Œ๋ฌธ์ž๋กœ ๋ฐ”๊พธ๊ณ , ์–‘์ชฝ ๊ณต๋ฐฑ์„ ์ง€์›๋‹ˆ๋‹ค
#     2. ํŠน์ˆ˜๋ฌธ์ž ์–‘์ชฝ์— ๊ณต๋ฐฑ์„ ๋„ฃ๊ณ 
#     3. ์—ฌ๋Ÿฌ๊ฐœ์˜ ๊ณต๋ฐฑ์€ ํ•˜๋‚˜์˜ ๊ณต๋ฐฑ์œผ๋กœ ๋ฐ”๊ฟ‰๋‹ˆ๋‹ค
#     4. a-zA-Z?.!,ยฟ๊ฐ€ ์•„๋‹Œ ๋ชจ๋“  ๋ฌธ์ž๋ฅผ ํ•˜๋‚˜์˜ ๊ณต๋ฐฑ์œผ๋กœ ๋ฐ”๊ฟ‰๋‹ˆ๋‹ค
#     5. ๋‹ค์‹œ ์–‘์ชฝ ๊ณต๋ฐฑ์„ ์ง€์›๋‹ˆ๋‹ค
#     6. ๋ฌธ์žฅ ์‹œ์ž‘์—๋Š” <start>, ๋์—๋Š” <end>๋ฅผ ์ถ”๊ฐ€ํ•ฉ๋‹ˆ๋‹ค
# ์ด ์ˆœ์„œ๋กœ ์ฒ˜๋ฆฌํ•ด์ฃผ๋ฉด ๋ฌธ์ œ๊ฐ€ ๋˜๋Š” ์ƒํ™ฉ์„ ๋ฐฉ์ง€ํ•  ์ˆ˜ ์žˆ๊ฒ ๋„ค์š”!
import re 

def preprocess_sentence(sentence):
    sentence = sentence.lower().strip() # 1
    sentence = re.sub(r"([?.!,ยฟ])", r" \1 ", sentence) # 2
    sentence = re.sub(r'[" "]+', " ", sentence) # 3
    sentence = re.sub(r"[^a-zA-Z?.!,ยฟ]+", " ", sentence) # 4
    sentence = sentence.strip() # 5
    sentence = '<start> ' + sentence + ' <end>' # 6
    return sentence

# ์ด ๋ฌธ์žฅ์ด ์–ด๋–ป๊ฒŒ ํ•„ํ„ฐ๋ง๋˜๋Š”์ง€ ํ™•์ธํ•ด ๋ณด์„ธ์š”.

print(preprocess_sentence("This @_is ;;;sample   23423     sentence."))
<start> this is sample sentence . <end>
#############################################

# Additionally, drop overly long sentences: they force excessive
# padding onto all the other data.

corpus = [] # holds the cleaned sentences

for sentence in raw_corpus:
    if len(sentence) == 0: continue
    if len(sentence) > 100: continue # drop overly long sentences
    if sentence[-1] == ":": continue
    
    # clean the sentence
    preprocessed_sentence = preprocess_sentence(sentence)

    
    # ํ† ํฐ์˜ ๊ฐœ์ˆ˜๊ฐ€ 15๊ฐœ๋ฅผ ๋„˜์–ด๊ฐ€๋Š” ๋ฌธ์žฅ์€ ์ œ์™ธ
    if len(preprocessed_sentence.split()) > 15: continue
    
    # keep it
    corpus.append(preprocessed_sentence)
    
# ์ •์ œ๋œ ๊ฒฐ๊ณผ ํ™•์ธ
corpus[:1]
['<start> the first words that come out <end>']
def tokenize(corpus):

    tokenizer = tf.keras.preprocessing.text.Tokenizer(
        num_words=12000, 
        filters=' ',
        oov_token="<unk>"
    )
    # Build the tokenizer's vocabulary from the corpus
    tokenizer.fit_on_texts(corpus)
    
    # Use the tokenizer to convert the corpus into sequences of indices
    tensor = tokenizer.texts_to_sequences(corpus)   
    
    # ์ž…๋ ฅ ๋ฐ์ดํ„ฐ์˜ ์‹œํ€€์Šค ๊ธธ์ด๋ฅผ ์ผ์ •ํ•˜๊ฒŒ ๋งž์ถฐ์ค๋‹ˆ๋‹ค
    # ๋งŒ์•ฝ ์‹œํ€€์Šค๊ฐ€ ์งง๋‹ค๋ฉด ๋ฌธ์žฅ ๋’ค์— ํŒจ๋”ฉ์„ ๋ถ™์—ฌ ๊ธธ์ด๋ฅผ ๋งž์ถฐ์ค๋‹ˆ๋‹ค.
    # ๋ฌธ์žฅ ์•ž์— ํŒจ๋”ฉ์„ ๋ถ™์—ฌ ๊ธธ์ด๋ฅผ ๋งž์ถ”๊ณ  ์‹ถ๋‹ค๋ฉด padding='pre'๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post', maxlen=15)  
    
    print(tensor, tokenizer)
    return tensor, tokenizer

tensor, tokenizer = tokenize(corpus)
[[  2   6 248 ...   0   0   0]
 [  2   8   4 ...   0   0   0]
 [  2   4  35 ...   0   0   0]
 ...
 [  2 124 112 ...   0   0   0]
 [  2 124 112 ...   0   0   0]
 [  2 124 112 ...   0   0   0]] <keras_preprocessing.text.Tokenizer object at 0x7fca320329d0>
for idx, sentence in enumerate(raw_corpus):
    if len(sentence) == 0: continue   # skip empty sentences
    if sentence[-1] == ":": continue  # skip sentences ending with ':'

    if idx > 9: break   # only check the first 10 sentences for now
        
    print(sentence)
The first words that come out
And I can see this song will be about you
I can't believe that I can breathe without you
But all I need to do is carry on
The next line I write down
And there's a tear that falls between the pages
I know that pain's supposed to heal in stages
But it depends which one I'm standing on I write lines down, then rip them up
Describing love can't be this tough I could set this song on fire, send it up in smoke
I could throw it in the river and watch it sink in slowly
# print the first five index sequences built with the tokenizer's vocabulary
print(tensor[:5, :]) 
[[  2   6 248 436  15  68  57   3   0   0   0   0   0   0   0]
 [  2   8   4  35  63  41 357  84  27 111   7   3   0   0   0]
 [  2   4  35  16 218  15   4  35 767 257   7   3   0   0   0]
 [  2  33  25   4  92  10  48  26 829  18   3   0   0   0   0]
 [  2   6 331 441   4 759  58   3   0   0   0   0   0   0   0]]
# tensor size after cleaning
print(len(tensor), len(corpus))
156013 156013
# ๋‹จ์–ด์žฅ์ด ์–ด๋–ป๊ฒŒ ๊ตฌ์ถ•๋˜์—ˆ๋Š”์ง€ ํ™•์ธ
for idx in tokenizer.index_word:
    print(idx, ":", tokenizer.index_word[idx])

    if idx >= 10: break
1 : <unk>
2 : <start>
3 : <end>
4 : i
5 : ,
6 : the
7 : you
8 : and
9 : a
10 : to
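One aside on a Keras behavior worth knowing here: the Tokenizer stores the full word index built during fit_on_texts, and num_words=12000 only caps the indices that texts_to_sequences will emit. A quick check:

# index_word holds every distinct word seen, typically far more than 12,000.
print(len(tokenizer.index_word))
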
# tensor์—์„œ ๋งˆ์ง€๋ง‰ ํ† ํฐ์„ ์ž˜๋ผ๋‚ด์„œ ์†Œ์Šค ๋ฌธ์žฅ์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค
# ๋งˆ์ง€๋ง‰ ํ† ํฐ์€ <end>๊ฐ€ ์•„๋‹ˆ๋ผ <pad>์ผ ๊ฐ€๋Šฅ์„ฑ์ด ๋†’์Šต๋‹ˆ๋‹ค.
src_input = tensor[:, :-1]  
# tensor์—์„œ <start>๋ฅผ ์ž˜๋ผ๋‚ด์„œ ํƒ€๊ฒŸ ๋ฌธ์žฅ์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
tgt_input = tensor[:, 1:]    

print(src_input[0])
print(tgt_input[0])
[  2   6 248 436  15  68  57   3   0   0   0   0   0   0]
[  6 248 436  15  68  57   3   0   0   0   0   0   0   0]
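To make the one-token shift concrete, here is a small check (decode is a throwaway helper defined just for this; index 0 is padding, which index_word does not contain, so it is skipped):

decode = lambda seq: " ".join(tokenizer.index_word[i] for i in seq if i != 0)
print(decode(src_input[0]))  # <start> the first words that come out <end>
print(decode(tgt_input[0]))  # the first words that come out <end>

At every position, the target is the source shifted one step to the left: the model learns to predict the next word.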
# ํ‰๊ฐ€ ๋ฐ์ดํ„ฐ์…‹ ๋ถ„๋ฆฌ
# 20%๋ฅผ ํ‰๊ฐ€์šฉ
from sklearn.model_selection import train_test_split
enc_train, enc_val, dec_train, dec_val = train_test_split(src_input, 
                                                    tgt_input, 
                                                    test_size=0.2, 
                                                    random_state=42)
print("Source Train:", enc_train.shape)
print("Target Train:", dec_train.shape)
Source Train: (124810, 14)
Target Train: (124810, 14)
BUFFER_SIZE = len(src_input)
BATCH_SIZE = 256
steps_per_epoch = len(src_input) // BATCH_SIZE

 # 12,000 words from the vocabulary the tokenizer built, plus 0:<pad>
 # (which is not among them), for a total of 12,001
VOCAB_SIZE = tokenizer.num_words + 1   

# ์ค€๋น„ํ•œ ๋ฐ์ดํ„ฐ ์†Œ์Šค๋กœ๋ถ€ํ„ฐ ๋ฐ์ดํ„ฐ์…‹์„ ๋งŒ๋“ญ๋‹ˆ๋‹ค
# ๋ฐ์ดํ„ฐ์…‹์— ๋Œ€ํ•ด์„œ๋Š” ์•„๋ž˜ ๋ฌธ์„œ๋ฅผ ์ฐธ๊ณ ํ•˜์„ธ์š”
# ์ž์„ธํžˆ ์•Œ์•„๋‘˜์ˆ˜๋ก ๋„์›€์ด ๋งŽ์ด ๋˜๋Š” ์ค‘์š”ํ•œ ๋ฌธ์„œ์ž…๋‹ˆ๋‹ค
# https://www.tensorflow.org/api_docs/python/tf/data/Dataset
# train ๋ฐ์ดํ„ฐ์…‹
train_dataset = tf.data.Dataset.from_tensor_slices((enc_train, dec_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder=True)
print(train_dataset)

# test ๋ฐ์ดํ„ฐ์…‹
test_dataset = tf.data.Dataset.from_tensor_slices((enc_val, dec_val))
test_dataset = test_dataset.shuffle(BUFFER_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)
print(test_dataset)
<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>
<BatchDataset shapes: ((256, 14), (256, 14)), types: (tf.int32, tf.int32)>
# Building the model
class TextGenerator(tf.keras.Model):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super().__init__()
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_size)
        self.rnn_1 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.rnn_2 = tf.keras.layers.LSTM(hidden_size, return_sequences=True)
        self.linear = tf.keras.layers.Dense(vocab_size)
        
    def call(self, x):
        out = self.embedding(x)
        out = self.rnn_1(out)
        out = self.rnn_2(out)
        out = self.linear(out)
        
        return out
    
embedding_size = 256  # larger values can capture more abstract word features, but require enough data
hidden_size = 1024   # how many "workers" the model gets; without enough data, too many cooks spoil the broth
lyricist = TextGenerator(tokenizer.num_words + 1, embedding_size, hidden_size)
# ๋ฐ์ดํ„ฐ์…‹์—์„œ ๋ฐ์ดํ„ฐ ํ•œ ๋ฐฐ์น˜๋งŒ ๋ถˆ๋Ÿฌ์˜ค๋Š” ๋ฐฉ๋ฒ•์ž…๋‹ˆ๋‹ค.
# ์ง€๊ธˆ์€ ๋™์ž‘ ์›๋ฆฌ์— ๋„ˆ๋ฌด ๋น ์ ธ๋“ค์ง€ ๋งˆ์„ธ์š”~
for src_sample, tgt_sample in train_dataset.take(1): break

# Feed the single batch into the model
lyricist(src_sample)
<tf.Tensor: shape=(256, 14, 12001), dtype=float32, numpy=
array([[[-1.44549223e-04, -1.15473573e-04, -6.30542418e-05, ...,
         -4.81275529e-05,  3.25414061e-04,  2.03498232e-04],
        [-1.86848993e-04, -2.30316669e-04, -2.25731535e-04, ...,
         -6.46176923e-05,  5.04140393e-04,  3.21763946e-04],
        [-1.03205770e-04, -4.84468270e-04, -1.17466385e-04, ...,
         -4.00223624e-04,  6.36043027e-04,  5.16410160e-04],
        ...,
        [-1.38640136e-03, -4.47364670e-04, -2.00071765e-04, ...,
         -2.70744367e-05,  5.63458947e-04,  1.36734487e-03],
        [-1.38968823e-03, -2.18201923e-04, -5.19401219e-05, ...,
          6.40518032e-04,  7.30469066e-04,  1.37798791e-03],
        [-1.20439660e-03, -3.50640657e-05,  1.76810892e-04, ...,
          1.40748988e-03,  9.31126706e-04,  1.26391207e-03]],

       [[-1.44549223e-04, -1.15473573e-04, -6.30542418e-05, ...,
         -4.81275529e-05,  3.25414061e-04,  2.03498232e-04],
        [-2.94670928e-04, -8.99762235e-05, -2.69432086e-04, ...,
          8.99600991e-07,  4.78153932e-04,  9.61799306e-05],
        [-4.10166569e-04, -5.20658679e-04, -2.53264647e-04, ...,
          3.23973043e-04,  5.47863834e-04, -1.30365908e-04],
        ...,
        [ 1.13907992e-03, -9.15171870e-04,  8.65232723e-04, ...,
          3.22713796e-03,  6.88177999e-04,  6.72521652e-04],
        [ 1.39695173e-03, -7.76036293e-04,  1.17004605e-03, ...,
          3.59163154e-03,  7.81425682e-04,  4.78076487e-04],
        [ 1.59162667e-03, -6.71598536e-04,  1.45776011e-03, ...,
          3.88893508e-03,  8.60124943e-04,  2.58787506e-04]],

       [[-1.44549223e-04, -1.15473573e-04, -6.30542418e-05, ...,
         -4.81275529e-05,  3.25414061e-04,  2.03498232e-04],
        [-9.57229131e-05,  4.74091621e-06, -6.54251926e-05, ...,
         -2.03965265e-05,  3.87494307e-04,  2.87009287e-04],
        [ 7.70175102e-05,  2.29143661e-05, -1.12108835e-04, ...,
         -6.15792305e-05,  7.54494278e-04,  2.59941327e-04],
        ...,
        [-3.39211168e-04, -7.62147596e-04,  7.07902480e-04, ...,
          1.03856390e-03,  2.22608724e-04,  5.74758451e-04],
        [-1.15578106e-04, -5.49393706e-04,  9.38904588e-04, ...,
          1.59561902e-03,  3.60214763e-04,  6.75352698e-04],
        [ 1.59531934e-04, -3.65949731e-04,  1.20420614e-03, ...,
          2.15193047e-03,  5.24571515e-04,  6.74697279e-04]],

       ...,

       [[-1.44549223e-04, -1.15473573e-04, -6.30542418e-05, ...,
         -4.81275529e-05,  3.25414061e-04,  2.03498232e-04],
        [-1.11118985e-04, -1.70493659e-04,  7.63675689e-06, ...,
         -1.08443775e-04,  5.13266714e-04,  2.32881168e-04],
        [-2.32437969e-07, -3.77636781e-04,  2.63728609e-04, ...,
          6.40669605e-05,  7.68769416e-04, -7.61850824e-05],
        ...,
        [-3.61508537e-05,  2.39523928e-04,  1.20974041e-03, ...,
          1.75854599e-03,  1.03051902e-03, -2.45660427e-04],
        [ 2.07302044e-04,  2.38900306e-04,  1.42566522e-03, ...,
          2.29557604e-03,  1.19230570e-03, -2.90194992e-04],
        [ 4.27131599e-04,  2.16009008e-04,  1.64476351e-03, ...,
          2.76157586e-03,  1.31183036e-03, -3.64537846e-04]],

       [[-1.44549223e-04, -1.15473573e-04, -6.30542418e-05, ...,
         -4.81275529e-05,  3.25414061e-04,  2.03498232e-04],
        [-1.79736468e-04, -8.09120102e-05, -1.06378997e-04, ...,
         -5.81318745e-05,  2.21763956e-04,  8.87998249e-05],
        [-5.21522888e-04, -4.01913829e-04,  8.20151035e-05, ...,
         -7.17598687e-06,  2.78848631e-04, -2.14788488e-05],
        ...,
        [-1.21572160e-03, -6.60044549e-04, -2.85983660e-05, ...,
          1.35207723e-03,  6.91875233e-04, -8.14947416e-04],
        [-8.80337728e-04, -5.37527958e-04,  1.78775561e-04, ...,
          1.90995191e-03,  9.32036142e-04, -6.58150821e-04],
        [-5.19666821e-04, -4.26618091e-04,  4.55988979e-04, ...,
          2.46126065e-03,  1.15186919e-03, -5.66828065e-04]],

       [[-1.44549223e-04, -1.15473573e-04, -6.30542418e-05, ...,
         -4.81275529e-05,  3.25414061e-04,  2.03498232e-04],
        [-3.42270418e-04, -2.84511450e-04,  7.37812079e-05, ...,
          1.81375945e-04,  3.50613933e-04,  2.93300021e-04],
        [-5.02196606e-04, -4.85470606e-04,  4.54048219e-04, ...,
          4.33529960e-04,  1.44814345e-04,  4.72123036e-04],
        ...,
        [ 3.60102160e-04,  3.53560550e-04,  1.83492934e-03, ...,
          3.30801494e-03,  8.18313390e-04, -1.35901757e-03],
        [ 5.72396151e-04,  2.90043768e-04,  1.98598928e-03, ...,
          3.61621543e-03,  8.88703857e-04, -1.44164031e-03],
        [ 7.50529172e-04,  2.19747817e-04,  2.14289455e-03, ...,
          3.86781618e-03,  9.42276383e-04, -1.49855157e-03]]],
      dtype=float32)>
lyricist.summary()
Model: "text_generator"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        multiple                  3072256   
_________________________________________________________________
lstm (LSTM)                  multiple                  5246976   
_________________________________________________________________
lstm_1 (LSTM)                multiple                  8392704   
_________________________________________________________________
dense (Dense)                multiple                  12301025  
=================================================================
Total params: 29,012,961
Trainable params: 29,012,961
Non-trainable params: 0
_________________________________________________________________
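As a sanity check on these counts (a side calculation of mine, using the standard LSTM parameter formula 4·(input_dim + hidden_dim + 1)·hidden_dim), the numbers line up with VOCAB_SIZE = 12001:

# Recomputing the parameter counts reported by summary()
emb   = 12001 * 256                     # Embedding: 3,072,256
lstm1 = 4 * (256 + 1024 + 1) * 1024     # LSTM 1:    5,246,976
lstm2 = 4 * (1024 + 1024 + 1) * 1024    # LSTM 2:    8,392,704
dense = 1024 * 12001 + 12001            # Dense:    12,301,025
print(emb + lstm1 + lstm2 + dense)      # 29,012,961 in total
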
optimizer = tf.keras.optimizers.Adam()

# Loss
loss = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)

lyricist.compile(loss=loss, 
                     optimizer=optimizer, 
                     metrics=['accuracy']) # track accuracy

lyrics_history = lyricist.fit(train_dataset, # training data
                                  validation_data=test_dataset, # validation data
                                  epochs=10)
Epoch 1/10
487/487 [==============================] - 100s 201ms/step - loss: 3.5322 - accuracy: 0.4808 - val_loss: 3.1559 - val_accuracy: 0.5119
Epoch 2/10
487/487 [==============================] - 98s 201ms/step - loss: 3.0327 - accuracy: 0.5202 - val_loss: 2.9604 - val_accuracy: 0.5263
Epoch 3/10
487/487 [==============================] - 98s 202ms/step - loss: 2.8649 - accuracy: 0.5308 - val_loss: 2.8512 - val_accuracy: 0.5338
Epoch 4/10
487/487 [==============================] - 98s 202ms/step - loss: 2.7386 - accuracy: 0.5393 - val_loss: 2.7715 - val_accuracy: 0.5406
Epoch 5/10
487/487 [==============================] - 98s 202ms/step - loss: 2.6322 - accuracy: 0.5470 - val_loss: 2.7075 - val_accuracy: 0.5464
Epoch 6/10
487/487 [==============================] - 98s 202ms/step - loss: 2.5367 - accuracy: 0.5544 - val_loss: 2.6574 - val_accuracy: 0.5517
Epoch 7/10
487/487 [==============================] - 98s 202ms/step - loss: 2.4491 - accuracy: 0.5618 - val_loss: 2.6150 - val_accuracy: 0.5572
Epoch 8/10
487/487 [==============================] - 98s 202ms/step - loss: 2.3682 - accuracy: 0.5695 - val_loss: 2.5784 - val_accuracy: 0.5624
Epoch 9/10
487/487 [==============================] - 98s 202ms/step - loss: 2.2921 - accuracy: 0.5774 - val_loss: 2.5480 - val_accuracy: 0.5676
Epoch 10/10
487/487 [==============================] - 98s 202ms/step - loss: 2.2198 - accuracy: 0.5858 - val_loss: 2.5225 - val_accuracy: 0.5724
# EX06์—์„œ ๋ฐฐ์šด ๊ฑฐ ์จ๋จน์–ด ๋ณด๊ธฐ
import matplotlib.pyplot as plt

acc = lyrics_history.history['accuracy']
val_acc = lyrics_history.history['val_accuracy']

loss = lyrics_history.history['loss']
val_loss = lyrics_history.history['val_loss']

epochs_range = range(len(acc))

plt.figure(figsize = (12, 8))
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label = 'Training Accuracy')
plt.plot(epochs_range, val_acc, label = 'Validation Accuracy')
plt.legend(loc = 'lower right')
plt.title('Training and Validation Accuracy')

plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label = 'Training Loss')
plt.plot(epochs_range, val_loss, label = 'Validation Loss')
plt.legend(loc = 'upper right')
plt.title('Training and Validation Loss')
plt.show()

์—ฌ๊ธฐ์„œ ์ •ํ™•๋„๊ฐ€ ๋” ์˜ฌ๋ผ๊ฐˆ ์ˆ˜ ์žˆ๊ณ 
๋กœ์Šค๋„ ๋” ์ค„์ผ ์ˆ˜ ์žˆ์œผ๋ฏ€๋กœ epoch๋ฅผ ๋” ์ง„ํ–‰ํ•ด์ฃผ์–ด๋„ ์ข‹์Œ
์•„๋‹ˆ๋ฉด Embedding_size์™€ Hidden_size๋ฅผ ๋†’์—ฌ์ฃผ๋Š” ๊ฒƒ์ด ์ข‹์„ ๋“ฏ.
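A minimal sketch of that option, assuming the same lyricist model and the train/test datasets built above (EarlyStopping is a standard tf.keras callback; this was not part of the original run):

# Train with an epoch budget and stop once val_loss stops improving.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',          # watch validation loss
    patience=2,                  # tolerate 2 stagnant epochs before stopping
    restore_best_weights=True    # roll back to the best weights seen
)
lyrics_history = lyricist.fit(train_dataset,
                              validation_data=test_dataset,
                              epochs=30,               # upper bound; the callback decides
                              callbacks=[early_stop])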

def generate_text(lyricist, tokenizer, init_sentence="<start>", max_len=20):
    # Convert the input init_sentence to a tensor as well
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    # Build the sentence one predicted word at a time:
    #    1. feed the current sentence tensor into the model
    #    2. take the word index with the highest predicted probability
    #    3. append the predicted word index to the end of the sentence
    #    4. stop when the model predicts <end> or max_len is reached
    while True:
        # 1
        predict = lyricist(test_tensor) 
        # 2
        predict_word = tf.argmax(tf.nn.softmax(predict, axis=-1), axis=-1)[:, -1] 
        # 3 
        test_tensor = tf.concat([test_tensor, tf.expand_dims(predict_word, axis=0)], axis=-1)
        # 4
        if predict_word.numpy()[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    generated = ""
    # Convert each word index back into a word with the tokenizer 
    for word_index in test_tensor[0].numpy():
        generated += tokenizer.index_word[word_index] + " "

    return generated
# Generate a sentence
generate_text(lyricist, tokenizer, init_sentence="<start> I ", max_len=20)
'<start> i m a survivor <end> '

Retrospective

  • NLP was the area of AI I was most interested in, but it feels much harder than I expected.
  • I became curious whether the length of a generated sentence can be controlled at will (see the sketch after this list).
  • The optimizer takes longer than I expected.
  • It made me think that good hardware really does matter.
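
On the length question, one possible approach (my own sketch, not from the course material; generate_text_min_len is a hypothetical variant introduced just for illustration): since generate_text already caps length via max_len, a minimum length can be enforced by masking out the <end> logit until enough tokens have been generated.

def generate_text_min_len(lyricist, tokenizer, init_sentence="<start>",
                          min_len=5, max_len=20):
    test_input = tokenizer.texts_to_sequences([init_sentence])
    test_tensor = tf.convert_to_tensor(test_input, dtype=tf.int64)
    end_token = tokenizer.word_index["<end>"]

    while True:
        logits = lyricist(test_tensor)[:, -1, :].numpy().copy()  # next-word logits
        if test_tensor.shape[1] < min_len:
            logits[:, end_token] = -1e9   # forbid <end> while the sentence is too short
        predict_word = logits.argmax(axis=-1)                    # greedy pick, as before
        test_tensor = tf.concat(
            [test_tensor, tf.constant(predict_word[None, :], dtype=tf.int64)], axis=-1)
        if predict_word[0] == end_token: break
        if test_tensor.shape[1] >= max_len: break

    return " ".join(tokenizer.index_word[idx] for idx in test_tensor[0].numpy())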

๋ฐ์ดํ„ฐ

  • ๋ฐ์ดํ„ฐ ๋ถ„๋ฆฌํ•˜๋Š” ๋ถ€๋ถ„์—์„œ ๊ฝค ์˜ค๋žœ ์‹œ๊ฐ„์„ ์Ÿ์•˜๋‹ค.
  • ์ผ๋‹จ ์ •๋ฆฌ๋Š” ํ–ˆ์ง€๋งŒ ์•„์ง ์ž˜ ์ดํ•ด๊ฐ€ ์•ˆ๊ฐ€๋Š” ๋ถ€๋ถ„์ด ์žˆ๋‹ค.

ํ† ํฐ์— ๋Œ€ํ•˜์—ฌ

  • ํ† ํฐ์„ ๋‹จ์–ด๋กœ ๋ณด์•„์•ผ ํ•˜๋Š”์ง€์— ๋Œ€ํ•ด ๊ฐœ๋…์ด ์•„์ง ์•ˆ์žกํžŒ๊ฒƒ ๊ฐ™๋‹ค.
  • ์ž์—ฐ์–ด๋ฅผ ์ฒ˜๋ฆฌํ•  ๋•Œ ๊ผญ ํ† ํฐ์„ ์จ์•ผ ํ•˜๋Š”๊ฐ€์— ๋Œ€ํ•ด ์—ฌ๋Ÿฌ๊ฐ€์ง€๋ฅผ ์ฐพ์•„๋ดค๋Š”๋ฐ
  • ๋‹ค๋ฅธ ํ•จ์ˆ˜๋ฅผ ์ด์šฉํ•˜์—ฌ ์ฒ˜๋ฆฌํ•˜๋Š” ๊ฒฝ์šฐ๋„ ์žˆ๋‹ค๋Š” ๊ฒƒ์„ ์•Œ์•˜๋‹ค.