CrossEntropyLoss
수식. Loss Function 종류 중 하나.

import torch
import torch.nn as nn
# Demo: nn.CrossEntropyLoss on raw logits for a 10-class problem.
# The target class is index 3 in every example; the more mass the logits
# put on index 3 relative to the rest, the smaller the loss.
loss_function = nn.CrossEntropyLoss()

# Nearly uniform logits -> the target barely stands out -> high loss.
logits_uniform = torch.tensor(
    [[0.8982, 0.805, 0.6393, 0.9983, 0.5731,
      0.0469, 0.556, 0.1476, 1.2404, 0.5544]]
)
loss = loss_function(logits_uniform, torch.LongTensor([3]))
loss.item()  # 2.0085251331329346

# Logits concentrated on class 3 -> lower loss.
logits_peaked = torch.Tensor(
    [[3e-5, 5e-3, 1e-6, 0.9204, 2e-3,
      3e-4, 5e-4, 5e-5, 0.176, 5e-4]]
)
loss = loss_function(logits_peaked, torch.LongTensor([3]))
loss.item()  # 1.5401395559310913

# Even more confident on class 3 -> lowest loss of the three.
logits_confident = torch.Tensor(
    [[3e-8, 5e-5, 1e-6, 2.4204, 2e-5,
      3e-5, 5e-4, 5e-5, 6e-4, 5e-4]]
)
loss = loss_function(logits_confident, torch.LongTensor([3]))
loss.item()  # 0.5878590941429138
모델 최적화 알고리즘 요약

X에 곱해지는 항들은 W로 치환 가능하고, 입력과 무관한 상수들은 전체를 B로 치환 가능하기 때문에, WX+B라는 Single layer perceptron과 동일한 결과를 낸다.

활성화 함수들:
Step
Sigmoid — Gradient Vanishing 현상이 발생한다.
tanh
ReLU
Leaky ReLU
class Net(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    Returns raw logits (no softmax); pair with nn.CrossEntropyLoss.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()
        # Layers kept as the same named attributes so state_dict keys
        # and external access (net.fc1, ...) are unchanged.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # fc1 -> ReLU -> fc2, expressed as a single chained pipeline.
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)
순방향 신경망 (Feedforward Neural Network)
순환 신경망 (Recurrent Neural Network)
# Mini-batch training loop over flattened 28x28 images.
# NOTE(review): `num_epochs`, `train_loader`, `net`, `loss_function`, and
# `optimizer` are defined elsewhere in the notebook — confirm before running.
# NOTE(review): `Variable` is the deprecated pre-0.4 PyTorch API; tensors
# are autograd-aware directly in modern PyTorch. `.cuda()` requires a GPU.
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = Variable(images.view(-1, 28 * 28)).cuda()  # flatten to (batch, 784), move to GPU
        labels = Variable(labels).cuda()
        optimizer.zero_grad()  # reset gradients accumulated from the previous step
        outputs = net(images)  # forward pass
        loss = loss_function(outputs, labels)  # loss
        loss.backward()  # backward pass
        optimizer.step()  # apply the parameter update
각 노드에 Wx + b를 적용하고 ReLU 함수를 적용했다. 이때 x는 이전 노드의 출력 값이다.

출력층 업데이트
은닉층 업데이트
import numpy as np
def sigmoid(x: np.ndarray) -> np.ndarray:
    """Element-wise logistic sigmoid, 1 / (1 + e^-x)."""
    neg_exp = np.exp(-x)
    return 1 / (1 + neg_exp)
def relu(x: np.ndarray) -> np.ndarray:
    """Element-wise ReLU: pass positive values through, zero out the rest."""
    positive_mask = x > 0
    return x * positive_mask
def leaky_relu(x: np.ndarray, alpha: float = 0.01) -> np.ndarray:
    """Element-wise Leaky ReLU: identity for x > 0, slope `alpha` otherwise."""
    scaled_down = alpha * x
    return np.where(x > 0, x, scaled_down)
def post_processing(predictions: np.ndarray) -> np.ndarray:
    """Binarize predictions at the 0.5 threshold (0 below, 1 otherwise)."""
    below_threshold = predictions < 0.5
    return np.where(below_threshold, 0, 1)
def display_results(inputs: np.ndarray, predictions: np.ndarray) -> None:
    """Print each input pair (a column of `inputs`) with its binarized prediction."""
    labels = post_processing(predictions)
    print("Input (A, B) | Predicted Y")
    print("---------------------------")
    for col in range(inputs.shape[1]):
        a, b = inputs[0, col], inputs[1, col]
        print(f" {a}, {b} | {labels[0, col]}")
def initialize_parameters() -> dict[str, np.ndarray]:
    """Randomly initialize a 2-2-1 network (weights ~ N(0, 1), biases zero).

    The two randn calls keep the same order as before so a seeded run
    produces identical parameters.
    """
    W1 = np.random.randn(2, 2)  # input (2 units) -> hidden (2 units)
    b1 = np.zeros((2, 1))       # hidden-layer bias
    W2 = np.random.randn(1, 2)  # hidden (2 units) -> output (1 unit)
    b2 = np.zeros((1, 1))       # output bias
    return {"W1": W1, "b1": b1, "W2": W2, "b2": b2}
def compute_loss(Y: np.ndarray, Y_hat: np.ndarray) -> np.ndarray:
    """Mean binary cross-entropy between targets Y and predictions Y_hat.

    Y_hat is clipped away from exactly 0 and 1 so the log terms stay
    finite: the unclipped version emitted "divide by zero" / "invalid
    value" RuntimeWarnings and eventually produced NaN losses during
    training. Values strictly inside (eps, 1 - eps) are unaffected, so
    the result is unchanged for well-behaved inputs.
    """
    eps = 1e-12  # smallest probability allowed inside the logs
    Y_hat = np.clip(Y_hat, eps, 1 - eps)
    m = Y.shape[0]  # number of samples (Y is a flat label vector)
    loss = -np.sum(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat)) / m
    return loss
def forward_propagation(
    X: np.ndarray,
    parameters: dict[str, np.ndarray],
) -> tuple[np.ndarray, np.ndarray]:
    """Run one forward pass; return (hidden activations A1, output activations A2)."""
    W1, b1 = parameters["W1"], parameters["b1"]
    W2, b2 = parameters["W2"], parameters["b2"]
    # Input -> hidden: affine transform followed by Leaky ReLU.
    A1 = leaky_relu(np.dot(W1, X) + b1)
    # Hidden -> output: affine transform followed by sigmoid.
    A2 = sigmoid(np.dot(W2, A1) + b2)
    return A1, A2
def backward_propagation(
    parameters: dict[str, np.ndarray],
    A1: np.ndarray,
    A2: np.ndarray,
    X: np.ndarray,
    Y: np.ndarray,
    alpha: float = 0.01,
) -> dict[str, np.ndarray]:
    """Gradients of the mean BCE loss w.r.t. W1, b1, W2, b2.

    Fixes two inconsistencies with the forward pass / loss used here:
    * For a sigmoid output trained with binary cross-entropy, dL/dZ2
      simplifies to (A2 - Y); the old code multiplied in an extra
      sigmoid-derivative factor A2 * (1 - A2), which is the MSE gradient.
    * The hidden layer uses leaky_relu, whose derivative is `alpha`
      (not 0) for negative inputs; the old (A1 > 0) mask zeroed those
      gradients. `alpha` defaults to leaky_relu's default slope of 0.01
      so existing callers are unaffected.
    """
    m = X.shape[1]  # number of training samples (columns of X)
    W2 = parameters["W2"]

    # Output layer: dL/dZ2 for sigmoid + BCE collapses to the residual.
    dZ2 = A2 - Y
    dW2 = np.dot(dZ2, A1.T) / m
    db2 = np.sum(dZ2, axis=1, keepdims=True) / m

    # Hidden layer: backprop through W2 and the Leaky ReLU.
    # leaky_relu preserves sign, so (A1 > 0) is equivalent to (Z1 > 0).
    dZ1 = np.dot(W2.T, dZ2) * np.where(A1 > 0, 1.0, alpha)
    dW1 = np.dot(dZ1, X.T) / m
    db1 = np.sum(dZ1, axis=1, keepdims=True) / m

    return {"dW1": dW1, "db1": db1, "dW2": dW2, "db2": db2}
def update_parameters(
    parameters: dict[str, np.ndarray],
    grads: dict[str, np.ndarray],
    learning_rate: float = 6.5,
) -> dict[str, np.ndarray]:
    """One vanilla gradient-descent step; updates the arrays in place and returns the dict."""
    for name in ("W1", "b1", "W2", "b2"):
        # In-place -= mutates the caller's arrays, matching the old behavior.
        parameters[name] -= learning_rate * grads["d" + name]
    return parameters
# Inputs and labels for the XOR problem; each column of `inputs` is one sample.
inputs = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]).T
outputs = np.array([0, 1, 1, 0])

# Fresh random parameters, then show what the untrained network predicts.
parameters = initialize_parameters()
predicted_outputs = forward_propagation(inputs, parameters)[1]
display_results(inputs, predicted_outputs)

# Full-batch gradient descent for 200000 steps.
for i in range(200000):
    A1, A2 = forward_propagation(inputs, parameters)
    loss = compute_loss(outputs, A2)  # loss of the pre-update forward pass
    grads = backward_propagation(parameters, A1, A2, inputs, outputs)
    parameters = update_parameters(parameters, grads)
    if i % 10000 == 0 and i > 0:
        print(f"{i=}, {loss=}")

# Predictions after training.
predicted_outputs = forward_propagation(inputs, parameters)[1]
print(predicted_outputs)
display_results(inputs, predicted_outputs)
i=10000, loss=0.4365236308924095
i=20000, loss=0.4027903318051978
i=30000, loss=0.3810628217006238
i=40000, loss=0.3659018035794712
i=50000, loss=0.3545317176017809
i=60000, loss=0.3480514836067102
i=70000, loss=0.3401866977059804
i=80000, loss=0.32698877237703583
i=90000, loss=0.3220346350622486
i=100000, loss=0.3169357345859798
i=110000, loss=0.3126991386147929
i=120000, loss=0.3132169521074427
i=130000, loss=0.31970056962151605
i=140000, loss=0.30925423788218825
<ipython-input-75-55413103712b>:38: RuntimeWarning: divide by zero encountered in log
loss = -np.sum(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat)) / m
<ipython-input-75-55413103712b>:38: RuntimeWarning: invalid value encountered in multiply
loss = -np.sum(Y * np.log(Y_hat) + (1 - Y) * np.log(1 - Y_hat)) / m
i=150000, loss=0.2984823303547125
i=160000, loss=0.3178656103270632
i=170000, loss=0.29718347070944096
i=180000, loss=0.3095699420136674
i=190000, loss=nan
[[0.02019567 1. 0.44362416 0.51593884]]
Input (A, B) | Predicted Y
---------------------------
0, 0 | 0
0, 1 | 1
1, 0 | 0
1, 1 | 1
Reference: 개발자를 위한 MLOps : 추천 시스템 구축부터 최적화까지(FastCampus)