Reference : https://pytorch.org/torchtune/stable/tutorials/lora_finetune.html
Only the A and B matrices are trained. This cuts the number of trainable parameters per layer from in_dim*out_dim down to r*(in_dim+out_dim), where r is very small compared to in_dim and out_dim. For example, with in_dim=out_dim=4096 and a rank r=8 decomposition, 4096*4096 ≈ 16.8M parameters shrink to 8*(4096+4096) = 65K. In actual code this is applied as follows.
from torch import nn, Tensor

class LoRALinear(nn.Module):
    def __init__(
        self,
        in_dim: int,
        out_dim: int,
        rank: int,
        alpha: float,
        dropout: float
    ):
        super().__init__()
        # These are the weights from the original pretrained model
        self.linear = nn.Linear(in_dim, out_dim, bias=False)

        # These are the new LoRA params. In general rank << in_dim, out_dim
        self.lora_a = nn.Linear(in_dim, rank, bias=False)
        self.lora_b = nn.Linear(rank, out_dim, bias=False)

        # Rank and alpha are commonly-tuned hyperparameters
        self.rank = rank
        self.alpha = alpha

        # Most implementations also include some dropout
        self.dropout = nn.Dropout(p=dropout)

        # The original params are frozen, and only LoRA params are trainable.
        self.linear.weight.requires_grad = False
        self.lora_a.weight.requires_grad = True
        self.lora_b.weight.requires_grad = True

    def forward(self, x: Tensor) -> Tensor:
        # This would be the output of the original model
        frozen_out = self.linear(x)

        # lora_a projects inputs down to the much smaller self.rank,
        # then lora_b projects back up to the output dimension
        lora_out = self.lora_b(self.lora_a(self.dropout(x)))

        # Finally, scale by the alpha parameter (normalized by rank)
        # and add to the original model's outputs
        return frozen_out + (self.alpha / self.rank) * lora_out
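A quick sanity check of the parameter counts above, using the class as defined (a minimal usage sketch; alpha=16 is just an arbitrary value for illustration, not something fixed by the tutorial):

lora_layer = LoRALinear(in_dim=4096, out_dim=4096, rank=8, alpha=16, dropout=0.0)

# Only lora_a and lora_b require grad: 8*4096 + 8*4096 = 65,536 params,
# versus 4096*4096 = 16,777,216 frozen params in the original linear layer.
trainable = sum(p.numel() for p in lora_layer.parameters() if p.requires_grad)
frozen = sum(p.numel() for p in lora_layer.parameters() if not p.requires_grad)
print(trainable, frozen)  # 65536 16777216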
# Print the first layer's self-attention in the usual Llama2 model
>>> print(base_model.layers[0].attn)
CausalSelfAttention(
  (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (pos_embeddings): RotaryPositionalEmbeddings()
)
# Print the same for Llama2 with LoRA weights
>>> print(lora_model.layers[0].attn)
CausalSelfAttention(
  (q_proj): LoRALinear(
    (dropout): Dropout(p=0.0, inplace=False)
    (lora_a): Linear(in_features=4096, out_features=8, bias=False)
    (lora_b): Linear(in_features=8, out_features=4096, bias=False)
  )
  (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (v_proj): LoRALinear(
    (dropout): Dropout(p=0.0, inplace=False)
    (lora_a): Linear(in_features=4096, out_features=8, bias=False)
    (lora_b): Linear(in_features=8, out_features=4096, bias=False)
  )
  (output_proj): Linear(in_features=4096, out_features=4096, bias=False)
  (pos_embeddings): RotaryPositionalEmbeddings()
)
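The printout shows LoRA applied only to q_proj and v_proj, while k_proj and output_proj remain plain frozen Linear layers. In practice torchtune's LoRA model builders handle this selection; the sketch below only illustrates the idea by hand, using a hypothetical replace_linear_with_lora helper (not a torchtune API) and the LoRALinear class defined above:

def replace_linear_with_lora(linear: nn.Linear, rank: int, alpha: float) -> LoRALinear:
    # Hypothetical helper: wrap an existing pretrained nn.Linear in a LoRALinear,
    # reusing its weight as the frozen base and adding fresh LoRA params.
    lora = LoRALinear(linear.in_features, linear.out_features, rank, alpha, dropout=0.0)
    lora.linear.weight.data.copy_(linear.weight.data)
    return lora

# Apply LoRA only to the query and value projections, as in the printout above.
attn = base_model.layers[0].attn
attn.q_proj = replace_linear_with_lora(attn.q_proj, rank=8, alpha=16)
attn.v_proj = replace_linear_with_lora(attn.v_proj, rank=8, alpha=16)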
As can be seen above, the original 4096 * 4096 pretrained weight matrix is kept frozen, and only the trainable low-rank A and B matrices are learned; their product, scaled by alpha/rank, serves as the update applied on top of the original weights.
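Because that update is just a low-rank product, it can be folded back into the frozen weight after training, so inference needs no extra modules. A minimal sketch, assuming the LoRALinear definition above (merge_lora is an illustrative helper, not part of the tutorial):

import torch

@torch.no_grad()
def merge_lora(layer: LoRALinear) -> nn.Linear:
    # W' = W + (alpha / rank) * B @ A: fold the learned low-rank update
    # into the original weight so inference uses a single plain Linear.
    merged = nn.Linear(layer.linear.in_features, layer.linear.out_features, bias=False)
    delta = (layer.alpha / layer.rank) * (layer.lora_b.weight @ layer.lora_a.weight)
    merged.weight.copy_(layer.linear.weight + delta)
    return merged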