LLM 학습시키기

Apic·어제

코딩

목록 보기

26/28

시작

저는 기존에 소설 데이터를 크롤링해서 가지고 있는 데이터로 준비했습니다.

데이터 구성은 아래와 같습니다.

제목
작가 이름
소개글
장르
태그

여기서 태그가 있는 작품이 있고, 없는 작품이 있습니다.
저는 태그가 없는 작품에 태그를 붙이기 위해 기존에 태그가 있는 작품들을 LLM에 학습시켜서 태그를 붙이려고 합니다.

학습 환경

구성	버전
OS	Ubuntu 22.04
Python	3.11.11
GPU	RTX 3060 12GB
CUDA	12.5
cudnn	9.3.0
LLM	torchtorchkimtorch/Llama-3.2-Korean-GGACHI-1B-Instruct-v1

참고 자료

파이토치 - cuda
LLM - Llama-3.2-Korean-GGACHI-1B-Instruct-v1

지금은 태그만 생성하면 되기 때문에 가벼운 1B 모델로 시도했습니다.

cuda 12.5

cuda 버전 확인
nvcc -V

  nvcc: NVIDIA (R) Cuda compiler driver
  Copyright (c) 2005-2024 NVIDIA Corporation
  Built on Wed_Apr_17_19:19:55_PDT_2024
  Cuda compilation tools, release 12.5, V12.5.40
  Build cuda_12.5.r12.5/compiler.34177558_0

cudnn 9.3.0

cudnn 버전 확인
cat /usr/include/cudnn_version.h | grep CUDNN_MAJOR -A 2
#define CUDNN_MAJOR 9
#define CUDNN_MINOR 3
#define CUDNN_PATCHLEVEL 0
--
#define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)

데이터 준비

데이터는 네이버, 카카오, 노벨피아, 문피아에서 크롤링한 데이터 약 31만개(314,568개)입니다.

전처리 이후 csv 파일로 준비했습니다. (아래 코드는 pickle 파일인 datas/all_data.pkl을 불러옵니다.)

태그가 있는 작품: 약 11.6만개 (116,499개)
태그가 없는 작품: 약 19.8만개 (198,069개)

데이터 구조

import os
import pandas as pd
import pickle
from datasets import  Dataset

# Load the dataset
def load_data(path: str):
    """Return the object stored in the pickle file at ``path``.

    Args:
        path: Location of the pickle file; handed directly to
            ``pandas.read_pickle``.

    Returns:
        The unpickled object (the script below uses it as a DataFrame).
    """
    # NOTE(review): unpickling can execute arbitrary code, so this should only
    # be pointed at files produced by a trusted source.
    return pd.read_pickle(path)

# Save the dataset
def save_data(data, path: str):
    """Pickle ``data`` into the file at ``path``, overwriting any existing file.

    Args:
        data: Any picklable object.
        path: Destination file path; opened in binary write mode.
    """
    with open(path, mode='wb') as sink:
        # Stream straight into the file handle rather than building the whole
        # byte string in memory first.
        pickle.Pickler(sink).dump(data)

# Split rows by whether they carry tags
def split_data_by_tags(df):
    """Partition ``df`` by whether the ``keywords`` column holds any text.

    A row counts as tagged when its ``keywords`` value, after replacing
    missing values with an empty string, casting to ``str`` and stripping
    surrounding whitespace, is non-empty.

    Args:
        df: DataFrame with a ``keywords`` column.

    Returns:
        A ``(train_df, test_df)`` tuple: rows with keywords first, rows
        without keywords second. The original values and index are kept.
    """
    normalized = df['keywords'].fillna('').astype(str).str.strip()

    # One mask drives both halves; its negation selects the untagged rows.
    has_tags = normalized.ne('')

    return df[has_tags], df[~has_tags]

# Prompt builder (intended to be passed to datasets.map)
def create_prompt(example):
    """Render one record as a single training prompt.

    Args:
        example: Mapping with ``title``, ``author``, ``genre``, ``summary``
            and ``keywords`` entries.

    Returns:
        A dict with one key, ``"text"``, holding the instruction, the work's
        metadata and finally the keywords, terminated by a newline.
    """
    title = example['title']
    author = example['author']
    genre = example['genre']
    summary = example['summary']
    keywords = example['keywords']

    # The trailing empty entry yields the final newline after the tags.
    lines = [
        "당신은 소설 정보를 분석하여 핵심 태그를 생성하는 전문가입니다.",
        "",
        "### 지시사항",
        "아래 작품 정보에 가장 적합한 태그를 쉼표로 구분된 목록으로 반환하세요. 다른 설명 없이 태그 목록만 생성해야 합니다.",
        "",
        "### 작품 정보",
        f"- 제목: {title}",
        f"- 작가: {author}",
        f"- 장르: {genre}",
        f"- 소개글: {summary}",
        "",
        "### 태그",
        f"{keywords}",
        "",
    ]
    # The rendered prompt is stored under the new 'text' column used for training.
    return {"text": "\n".join(lines)}

if __name__ == '__main__':
    # Data-preparation entry point: load the crawled records, separate tagged
    # from untagged rows, turn the tagged rows into training prompts, and
    # write both the training dataset and the inference CSV to ``output_dir``.
    output_dir = 'ai/data'
    if not os.path.exists(output_dir):
        print(f'{output_dir} 폴더가 없습니다. 생성합니다.')
    # exist_ok keeps this from failing if the directory appears between the
    # check above and this call.
    os.makedirs(output_dir, exist_ok=True)

    # 1. Load the raw data and split it by tag presence.
    data_path = 'datas/all_data.pkl'
    df = load_data(data_path)
    print(f'총 데이터 개수: {df.shape[0]}')

    labeled_df, inference_df = split_data_by_tags(df)
    print(f'학습용 데이터(태그 있음) 개수: {len(labeled_df)}')
    print(f'추론용 데이터(태그 없음) 개수: {len(inference_df)}')

    # 2. Convert the tagged rows into a Hugging Face Dataset.
    dataset = Dataset.from_pandas(labeled_df)

    # 3. Render every row as a prompt with .map().
    processed_dataset = dataset.map(
        create_prompt,
        remove_columns=dataset.column_names,  # keep only the generated 'text' column
        num_proc=1,  # single process, no multiprocessing
    )

    # 4. Split into train and validation sets.
    #    A fixed seed keeps the validation set identical across runs, so
    #    eval_loss values from different training runs stay comparable.
    final_datasets = processed_dataset.train_test_split(test_size=0.1, seed=42)

    # 5. Persist the final dataset; it can be reloaded later with load_from_disk.
    final_datasets.save_to_disk(os.path.join(output_dir, 'final_training_dataset'))

    # The untagged rows are stored separately for inference.
    inference_df.to_csv(os.path.join(output_dir, 'inference_data.csv'), index=False)

    print("\n🎉 데이터셋 준비 및 저장 완료!")
    print(f"최종 학습/검증 데이터셋이 '{os.path.join(output_dir, 'final_training_dataset')}' 폴더에 저장되었습니다.")
    print(final_datasets)

총 데이터 개수: 314568
학습용 데이터(태그 있음) 개수: 116499
추론용 데이터(태그 없음) 개수: 198069
Map: 100%|████████████████████| 116499/116499 [00:05<00:00, 19984.83 examples/s]
Saving the dataset (1/1 shards): 100%|█| 104849/104849 [00:00<00:00, 497618.19 e
Saving the dataset (1/1 shards): 100%|█| 11650/11650 [00:00<00:00, 437701.13 exa

🎉 데이터셋 준비 및 저장 완료!
최종 학습/검증 데이터셋이 'ai/data/final_training_dataset' 폴더에 저장되었습니다.
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 104849
    })
    test: Dataset({
        features: ['text'],
        num_rows: 11650
    })
})

학습

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    EarlyStoppingCallback,
    DataCollatorForLanguageModeling,
    Trainer
)

import torch
from trl import SFTTrainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from datasets import load_from_disk
import os

# Weights & Biases experiment tracking.
# The hyperparameters go through ``config=`` so they are stored on the run.
# Assigning a dict to ``wandb.config`` after ``init`` only rebinds the module
# attribute to a plain dict, so the values are not attached to the run.
# The values below mirror the TrainingArguments used later in this script.
import wandb
wandb.init(
    project='recommend_site',
    config={
        "learning_rate": 2e-4,
        "epochs": 1,
        "batch_size": 4,
        "gradient_accumulation_steps": 3,
    },
)

model_id = "torchtorchkimtorch/Llama-3.2-Korean-GGACHI-1B-Instruct-v1"

# Training data written by the data-preparation script.
dataset_path = 'ai/data/final_training_dataset'
processed_datasets = load_from_disk(dataset_path)

print("✅ 저장된 데이터셋 로드 완료!")
print(processed_datasets)

# Load the tokenizer once and configure it up front. The same instance is used
# for tokenization here and for batching/saving later, so there is no need to
# load it a second time after the model.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token  # pad with EOS
tokenizer.padding_side = "right"


# Tokenize the dataset: convert every entry of the 'text' column into token
# ids, truncating anything longer than 512 tokens.
def tokenize_function(examples):
    """Tokenize a batch of prompts from the ``text`` column.

    Args:
        examples: Batch mapping whose ``'text'`` entry is a list of strings.

    Returns:
        The tokenizer output for the batch, truncated to 512 tokens per row.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)


tokenized_datasets = processed_datasets.map(
    tokenize_function,
    batched=True,  # process many samples per call for speed
    num_proc=1,  # single process
)

print("\n✅ 토크나이징 완료!")
print(tokenized_datasets)

# 4-bit NF4 quantization with bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    device_map="auto",
)

# The KV cache is switched off for training with gradient checkpointing.
model.config.use_cache = False
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def find_all_linear_names(model, exclude=("lm_head",)):
    """Collect the leaf names of all ``torch.nn.Linear`` submodules of ``model``.

    The result is meant to be used as LoRA ``target_modules``. The output
    projection (``lm_head``) is left out by default so that no adapter is
    attached to the output head; pass ``exclude=()`` to get every linear
    layer name.

    Args:
        model: Any ``torch.nn.Module``.
        exclude: Leaf names to drop from the result.

    Returns:
        A sorted list of unique leaf module names (the part after the last
        ``.`` in the qualified name). Sorting makes the order deterministic.
    """
    skipped = set(exclude)
    leaf_names = {
        name.rsplit('.', 1)[-1]
        for name, module in model.named_modules()
        # The root module has an empty name, which is not a usable target.
        if name and isinstance(module, torch.nn.Linear)
    }
    return sorted(leaf_names - skipped)

# LoRA adapter configuration: adapters are attached to every module name
# returned by find_all_linear_names(model).
# NOTE(review): lora_alpha (16) is much smaller than r (128); confirm this
# ratio is intended.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=find_all_linear_names(model),
    r=128,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
model = get_peft_model(model=model, peft_config=peft_config)

# Training configuration.
# Evaluation, checkpointing and best-model tracking are enabled explicitly:
# without an eval strategy the eval_dataset is never used, and
# EarlyStoppingCallback needs load_best_model_at_end plus a metric to monitor.
output_dir = 'outputs'
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=3,  # 4 x 3 = 12 sequences per optimizer step
    gradient_checkpointing=True,
    max_grad_norm=0.3,
    num_train_epochs=1,
    learning_rate=2e-4,
    bf16=True,
    save_total_limit=3,
    logging_steps=10,
    output_dir=output_dir,
    optim="paged_adamw_32bit",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    # max_steps=1000,       # uncomment for a short smoke test
    report_to='wandb',
    # NOTE(review): `eval_strategy` is the current argument name; older
    # transformers releases call it `evaluation_strategy`.
    eval_strategy="steps",
    eval_steps=1000,
    # Save on the same schedule as evaluation so the best checkpoint exists
    # on disk when it has to be reloaded.
    save_strategy="steps",
    save_steps=1000,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # criterion for the best model
    greater_is_better=False,  # lower eval_loss is better
)

# Stop when eval_loss has not improved by at least 0.001 for 3 consecutive
# evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.001,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets["test"],
    # The collator assembles batches; mlm=False selects causal-LM labels.
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    # Register the callback; merely constructing it has no effect.
    callbacks=[early_stopping],
)

trainer.train()
trainer.save_model(output_dir)

# Also store the trained model and tokenizer together in a sub-folder.
# NOTE(review): the folder is named "llama2_checkpoint" although model_id is a
# Llama 3.2 model; the path is left unchanged so existing consumers still work.
output_dir = os.path.join(output_dir, "llama2_checkpoint")
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)