LLM 트랜스포머

JooHeon·2025년 2월 27일

토큰화 : 텍스트를 적절한 단위로 나누고 숫자 아이디를 부여하는 것
임베딩 : 토큰의 의미를 담기 위해 토큰을 2개 이상의 숫자로 이루어진 벡터로 변환하는 것

# Create token embeddings with PyTorch

import torch
from torch import nn

input_text_list = "나는 최근 파리 여행을 다녀왔다".split()
# Build the vocabulary from the unique words (first-seen order) so that every
# id stays below len(str2idx) even if the sentence repeats a word; enumerating
# the raw list would leave gaps and overflow the embedding table.
str2idx = {word: idx for idx, word in enumerate(dict.fromkeys(input_text_list))}

# Embedding layer that maps each token id to a 16-dimensional vector
embed_layer = nn.Embedding(len(str2idx), 16)
# Convert the tokens to token ids
input_ids = [str2idx[word] for word in input_text_list]
# Create the token embeddings.
# A tensor is a multi-dimensional array, similar to NumPy's ndarray but able
# to run on a GPU.
# unsqueeze(dim) inserts a new dimension of size 1 at the given position;
# here it adds the leading batch dimension.
token_embeddings = embed_layer(torch.tensor(input_ids)).unsqueeze(0)  # (1, 5, 16)

절대적 위치 인코딩 : 입력 토큰의 위치에 따라 고정된 임베딩을 가산
RNN과 다르게 트랜스포머는 모든 입력을 동시에 처리하기 때문에 텍스트의 순서를 알기 위해 위치 인코딩이 필요하다

# Position-embedding layer that supports at most 12 tokens. Its width is taken
# from the token embeddings so the two tensors can be added element-wise.
embedding_dim = token_embeddings.size(-1)  # 16
position_embed_layer = nn.Embedding(12, embedding_dim)

# Position ids 0, 1, ..., len(input_ids) - 1, with a leading batch dimension
position_ids = torch.arange(len(input_ids), dtype=torch.long).unsqueeze(0)
# Look up the absolute position encodings
position_encodings = position_embed_layer(position_ids)
# Final input embeddings = token embeddings + position encodings
input_embeddings = token_embeddings + position_encodings

쿼리 : 입력하는 검색어
키 : 문서가 가진 특징
값 : 제공할 문서

트랜스포머는 단어 간의 관련성을 찾기 위해 토큰 임베딩을 변환하는 가중치를 도입 (어텐션 연산)

# Three independent 16 -> 16 linear layers, created in query, key, value order
weight_q, weight_k, weight_v = (nn.Linear(16, 16) for _ in range(3))

# Project the input embeddings through each layer; every result is (1, 5, 16)
querys, keys, values = (
    projection(input_embeddings)
    for projection in (weight_q, weight_k, weight_v)
)

import torch.nn.functional as f

def compute_attention(querys, keys, values):
    """Return the scaled dot-product attention output.

    Args:
        querys: Tensor whose last two dimensions are (query_len, dim).
        keys: Tensor whose last two dimensions are (key_len, dim).
        values: Tensor whose last two dimensions are (key_len, value_dim).

    Returns:
        Tensor whose last two dimensions are (query_len, value_dim): for each
        query position, the softmax-weighted sum of ``values``.
    """
    # Local import: the file never brought ``sqrt`` into scope.
    import math

    dim_k = keys.size(-1)
    # Similarity scores: transpose the last two key dimensions so the feature
    # axes line up for the matrix product. Dividing by sqrt(dim_k) keeps the
    # dot products from growing with the feature size, which would otherwise
    # push softmax towards extreme (near one-hot) values.
    scores = querys @ keys.transpose(-2, -1) / math.sqrt(dim_k)
    # Normalize over the key axis to get a probability distribution per query
    weights = f.softmax(scores, dim=-1)
    return weights @ values

트랜스포머에서는 토큰 사이의 관계를 여러 측면에서 동시에 고려할 때 언어나 문장에 대한 이해도가 올라가기 때문에 멀티 헤드 어텐션을 도입

class MultiheadAttention(nn.Module):
    """Multi-head attention built on top of ``compute_attention``.

    Queries, keys and values are projected to ``d_model`` features, split into
    ``n_head`` heads that attend independently, then merged back and passed
    through a final linear layer.

    Args:
        token_embed_dim: Feature size of the incoming embeddings.
        d_model: Feature size after projection; must be divisible by n_head.
        n_head: Number of attention heads.

    Raises:
        ValueError: If ``d_model`` is not divisible by ``n_head``.
    """

    def __init__(self, token_embed_dim, d_model, n_head):
        super().__init__()
        if d_model % n_head != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_head ({n_head})"
            )
        self.n_head = n_head
        self.d_model = d_model
        self.head_dim = d_model // n_head
        self.weight_q = nn.Linear(token_embed_dim, d_model)
        self.weight_k = nn.Linear(token_embed_dim, d_model)
        self.weight_v = nn.Linear(token_embed_dim, d_model)
        self.concat_linear = nn.Linear(d_model, d_model)

    def forward(self, querys, keys, values):
        """Apply multi-head attention.

        Args:
            querys: Tensor of shape (batch, query_len, token_embed_dim).
            keys: Tensor of shape (batch, key_len, token_embed_dim).
            values: Tensor of shape (batch, key_len, token_embed_dim).

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size, query_len, _ = querys.size()
        key_len = keys.size(1)
        # After projection the layout is (batch, seq_len, n_head, head_dim).
        # Swap the seq_len and n_head axes so each head is attended to
        # independently: (batch, n_head, seq_len, head_dim).
        # The head size comes from d_model (the projected width), not from the
        # input width, so token_embed_dim may differ from d_model.
        querys = self.weight_q(querys).view(
            batch_size, query_len, self.n_head, self.head_dim
        ).transpose(1, 2)
        keys = self.weight_k(keys).view(
            batch_size, key_len, self.n_head, self.head_dim
        ).transpose(1, 2)
        values = self.weight_v(values).view(
            batch_size, key_len, self.n_head, self.head_dim
        ).transpose(1, 2)
        attention = compute_attention(querys, keys, values)
        # Move the head axis back next to head_dim and merge the two into
        # d_model = n_head * head_dim. transpose() returns a non-contiguous
        # view, so contiguous() is required before view().
        output = attention.transpose(1, 2).contiguous().view(
            batch_size, query_len, self.d_model
        )
        # Final linear layer (weight matrix plus bias) mixes the heads
        return self.concat_linear(output)

# Layer normalization
# The normalized width is taken from the tensor itself so this snippet does
# not depend on a separately defined embedding-size variable.
norm = nn.LayerNorm(input_embeddings.size(-1))
norm_x = norm(input_embeddings)
norm_x.shape # torch.Size([1, 5, 16])

# Per-token mean is ~0. LayerNorm normalizes with the biased variance, while
# .std() reports the unbiased estimate, hence sqrt(16 / 15) ~= 1.0328 and not 1.
norm_x.mean(dim=-1).data, norm_x.std(dim=-1).data

# (tensor([[ 2.2352e-08, -1.1176e-08, -7.4506e-09, -3.9116e-08, -1.8626e-08]]),
#  tensor([[1.0328, 1.0328, 1.0328, 1.0328, 1.0328]]))

피드 포워드 층 : 데이터의 특징을 학습하는 완전 연결 층

class PreLayerNormFeedForward(nn.Module):
    """Feed-forward block that applies layer normalization before the MLP.

    Args:
        d_model: Feature size of the input and output.
        dim_feedforward: Hidden feature size of the expanded layer.
        dropout: Dropout probability used by both dropout layers.
    """

    def __init__(self, d_model, dim_feedforward, dropout):
        super().__init__()
        # Expand the features to dim_feedforward
        self.linear1 = nn.Linear(d_model, dim_feedforward)
        # Shrink the expanded features back to d_model
        self.linear2 = nn.Linear(dim_feedforward, d_model)
        # Two dropouts after the linear transforms, to curb overfitting and
        # improve stability
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        # Non-linearity so the block can learn more complex, varied patterns
        self.activation = nn.GELU()  # activation function
        self.norm = nn.LayerNorm(d_model)  # layer normalization

    def forward(self, src):
        """Return the block output; same shape as ``src``."""
        x = self.norm(src)
        # NOTE(review): the residual is added to the normalized input and the
        # second dropout is applied to the sum; the usual pre-LN form is
        # ``src + dropout(ffn(norm(src)))``. Kept as written -- confirm intent.
        x = x + self.linear2(self.dropout1(self.activation(self.linear1(x))))
        x = self.dropout2(x)
        return x

인코더 : 안정적인 학습이 가능하도록 반복적으로 입력을 더하는 잔차 연결을 수행
디코더 : 순차적으로 출력을 생성하기 위해 마스크 멀티 헤드 어텐션을 수행하여 앞에서 생성한 토큰을 기반으로 다음 토큰을 생성

JooHeon

이전 포스트

LLM 관련 용어 이해

다음 포스트

LLM 트랜스포머

LLM 관련 용어 이해

허깅페이스 트랜스포머

0개의 댓글