import os #os
from glob import glob #local filelist 조회
import tensorflow as tf
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython magic: only valid inside a notebook; commented out for script use
np.random.seed(7777)      # fix NumPy RNG for reproducibility
tf.random.set_seed(7777)  # fix TensorFlow RNG for reproducibility
os.listdir("../../datasets/")  # inspect dataset directory contents (list of names)
train_img = glob("../../datasets/cifar/train/*.png")  # collect all training .png paths
# Peek at a single example: read raw bytes, then decode to an image tensor.
path = train_img[0]
gfile = tf.io.read_file(path)        # file contents as a binary (bytes) tensor
image = tf.io.decode_image(gfile)    # binary -> image tensor
def read_image(path):
    """Read the file at `path` and decode it to a float32 image tensor.

    Args:
        path: scalar string tensor (or Python str) — path to an image file.

    Returns:
        Decoded image tensor (float32; `decode_image` with a float dtype
        scales pixel values into [0, 1] per the TF docs).

    Fix: the original body was not indented (IndentationError from the
    notebook export).
    """
    gfile = tf.io.read_file(path)                        # raw bytes
    image = tf.io.decode_image(gfile, dtype=tf.float32)  # bytes -> float32 tensor
    return image
# Build a tf.data pipeline over the training image paths.
dataset = tf.data.Dataset.from_tensor_slices(train_img)
AUTOTUNE = tf.data.experimental.AUTOTUNE
# map expects tensor-compatible data only.
dataset = dataset.map(read_image, num_parallel_calls=AUTOTUNE)
# num_parallel_calls sets the parallelism level; AUTOTUNE lets TF choose it automatically.
next(iter(dataset))
dataset = dataset.batch(128)
next(iter(dataset)).shape  # TensorShape([128, 32, 32, 3])
# prefetch: overlap data preparation with consumption.
dataset = dataset.prefetch(1)
# dataset = dataset.prefetch(AUTOTUNE)  # TF allocates the buffer dynamically.
# shuffle: buffer_size controls how thoroughly examples are mixed.
dataset = dataset.shuffle(buffer_size=10)
# repeat: with no argument, iterate the dataset indefinitely.
dataset = dataset.repeat()
# Typical default pipeline configuration.
# NOTE(review): shuffle after batch shuffles whole batches, not individual
# examples; the conventional order is shuffle -> batch — confirm intent.
AUTOTUNE = tf.data.experimental.AUTOTUNE
dataset = dataset.map(read_image, num_parallel_calls=AUTOTUNE)
dataset = dataset.batch(128)
dataset = dataset.prefetch(AUTOTUNE)
dataset = dataset.shuffle(buffer_size=10)
dataset = dataset.repeat()
# Extract the class label from the filename
# (e.g. ".../0_frog.png" -> "frog").
train_img[0].split("/")[-1].split(".")[0].split("_")[-1]
label_txt = tf.io.read_file("../../datasets/cifar/labels.txt")
# One class name per line; decode bytes -> str, strip, split into an array.
label_names = np.array(label_txt.numpy().decode('ascii').strip().split("\n"))
# onehotencoding
def parse_label(path):
    """One-hot encode the class label embedded in an image filename.

    The label is the token after the last '_' in the basename,
    e.g. '.../0_frog.png' -> 'frog'. It is compared element-wise against
    the module-level `label_names` array.

    Args:
        path: Python str path to an image file.

    Returns:
        np.float32 one-hot vector of length len(label_names).

    Fix: the original body was not indented (IndentationError from the
    notebook export).
    """
    name = path.split("/")[-1].split(".")[0].split("_")[-1]
    return np.array(name == label_names, dtype=np.float32)
# Precompute one-hot label vectors for every training image path.
train_y = [parse_label(y) for y in train_img]
def read_data(path, label):
    """Load the image at `path`; pass `label` through unchanged.

    Intended for mapping over a Dataset of (path, label) pairs.

    Args:
        path: scalar string tensor — image file path.
        label: precomputed label (passed through untouched).

    Returns:
        (image_tensor, label) tuple.

    Fix: the original body was not indented (IndentationError from the
    notebook export).
    """
    img = read_image(path)
    return img, label
# Pipeline variant using precomputed (path, label) pairs.
dataset = tf.data.Dataset.from_tensor_slices((train_img, train_y))
dataset = dataset.map(read_data, num_parallel_calls=AUTOTUNE)
dataset = dataset.prefetch(1)
dataset = dataset.batch(4)
# NOTE(review): buffer_size=1 makes shuffle a no-op — confirm this is
# intentional (demo) rather than a real training setting.
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat() # with no argument, repeats indefinitely
next(iter(dataset))
def get_label(path):
    """Derive a one-hot label tensor from an image path using tf string ops.

    Graph-compatible counterpart of `parse_label`: takes the token after
    the last '_', strips the '.png' suffix, and compares against the
    module-level `label_names` array.

    Args:
        path: scalar string tensor — image file path.

    Returns:
        uint8 one-hot tensor of length len(label_names).

    Fix: the original body was not indented (IndentationError from the
    notebook export).
    """
    fname = tf.strings.split(path, '_')[-1]  # e.g. 'frog.png'
    # NOTE(review): regex_replace treats '.png' as a regex where '.' matches
    # any character; r'\.png$' would be stricter — behavior kept as-is.
    lbl_name = tf.strings.regex_replace(fname, '.png', '')
    onehot = tf.cast(lbl_name == label_names, tf.uint8)
    return onehot
def load_image_label(path):
    """Load and decode the image at `path` together with its one-hot label.

    Args:
        path: scalar string tensor — image file path encoding the label
        in its basename (see `get_label`).

    Returns:
        (float32 image tensor, uint8 one-hot label tensor).

    Fix: the original body was not indented (IndentationError from the
    notebook export).
    """
    gfile = tf.io.read_file(path)                        # raw bytes
    image = tf.io.decode_image(gfile, dtype=tf.float32)  # bytes -> float32 tensor
    label = get_label(path)
    return image, label
# Pipeline variant computing labels inside the graph via load_image_label.
dataset = tf.data.Dataset.from_tensor_slices(train_img)
dataset = dataset.map(load_image_label, num_parallel_calls=AUTOTUNE)
dataset = dataset.prefetch(1)
dataset = dataset.batch(4)
# NOTE(review): buffer_size=1 makes shuffle a no-op — confirm intent.
dataset = dataset.shuffle(buffer_size=1)
dataset = dataset.repeat()
next(iter(dataset))
# Reference
# 1) 제로베이스 데이터스쿨 강의자료 (Zero-base Data School lecture materials)