[AI] CUDA out of memory...

늘 공부하는 괴짜·2025년 7월 7일

cuda memory oom

AI : Langchain (RAG)

목록 보기

39/39

1. VRAM 이 적으면...

4090 이나 5090 같은 24기가, 32기가 VRAM 이 탑재되어 있다면 메모리의 속박에서 조금 더 자유로울 것이다. 하지만...

2. 끼야야야약! ㅠ

아니 뭐 별로 한 것도 없다고...

torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 5.16 GiB. GPU 0 has a total capacity of 15.48 GiB of which 3.58 GiB is free. Including non-PyTorch memory, this process has 11.87 GiB memory in use. Of the allocated memory 8.11 GiB is allocated by PyTorch, and 3.60 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

3. GPT 에게 물어보자.

CUDA 메모리 잡아먹는 공식은 다음과 같다.

(모델 파라미터 크기) + (입력 토큰 길이 × 모델 크기에 비례하는 활성화 메모리) × batch size

4. 계산하는 메서드 구현

# 대충 CUDA 메모리를 이정도 먹는다고 계산한다.
def measure_mem(model, tokenizer, text):
    """Print a rough estimate of the CUDA memory used by one forward pass.

    Tokenizes ``text``, runs ``model`` once under ``torch.no_grad()`` and
    prints two lines:

    * the growth of ``torch.cuda.memory_allocated()`` across the call, i.e.
      memory that is still held afterwards (the model output is kept in a
      local until the reading is taken), in total and per input token;
    * the peak allocation reached during the call. This is the figure that
      matters when chasing an out-of-memory error, because transient buffers
      are already freed again by the time the "after" reading is taken.

    Args:
        model: Callable that accepts the ``input_ids`` tensor and exposes a
            ``device`` attribute.
        tokenizer: Callable accepting ``(text, return_tensors="pt")`` and
            returning an object with an ``input_ids`` tensor.
        text: Prompt string to measure.

    Returns:
        None. The results are written to stdout.
    """
    # Collect unreachable Python objects and release unused cached blocks so
    # leftovers from earlier work do not skew the readings.
    gc.collect()
    torch.cuda.empty_cache()

    # Tokenize and move the ids to wherever the model lives.
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    # Restart peak tracking so the peak reflects this forward pass only.
    # Guarded because the reset needs a usable CUDA device.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    before = torch.cuda.memory_allocated()

    # Inference only: no autograd graph is recorded.
    with torch.no_grad():
        # Bind the output so it stays allocated for the "after" reading.
        _ = model(input_ids)

    after = torch.cuda.memory_allocated()
    peak = torch.cuda.max_memory_allocated()

    # Convert byte counts to MiB.
    mib = 1024 ** 2
    used = (after - before) / mib
    peak_used = (peak - before) / mib

    token_len = input_ids.shape[1]
    # A zero-token input must not crash the report with ZeroDivisionError.
    per_token = used / token_len if token_len else 0.0

    print(f"Tokens: {token_len}, Used Memory: {used:.2f} MB, Per Token: {per_token:.4f} MB")
    print(f"Peak During Forward: {peak_used:.2f} MB")

5. 테스트

# Test 1
# model_name = "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct"
# input_text = "프랑스의 수도는 어디 인가요? 간.단.하.게 알려주세요." * 100
# Tokens: 1800, Used Memory: 845.32 MB, Per Token: 0.4696 MB

# Test 2
# input_text = "프랑스의 수도는 어디 인가요? 간.단.하.게 알려주세요." * 200
# Tokens: 3600, Used Memory: 1678.57 MB, Per Token: 0.4663 MB

# Test 3
# input_text = "프랑스의 수도는 어디 인가요? 간.단.하.게 알려주세요." * 500
# torch.OutOfMemoryError: CUDA out of memory. Tried to allocate 6.87 GiB. GPU 0 has a total capacity of 15.48 GiB of which 5.33 GiB is free. Including non-PyTorch memory, this process has 10.11 GiB memory in use. Of the allocated memory 9.31 GiB is allocated by PyTorch, and 667.33 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

6. 전체 소스

from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import gc

# 역시... 모델의 크기가 크면 클수록 CUDA 메모리를 많이 사용한다.
# (모델 파라미터 크기) + (입력 토큰 길이 × 모델 크기에 비례하는 활성화 메모리) × batch size
# float32 모델의 입력토큰이 30 정도인 경우 ((240,000,000 * 4) + (30 * 512000)) * 1  = 960,000,000 + 15,360,000 = 975,360,000 bytes
# float16 모델의 입력토큰이 30 정도인 경우 ((240,000,000 * 2) + (30 * 512000)) * 1 = 480,000,000 + 15,360,000 = 495,360,000 bytes
# 대충 CUDA 메모리를 이정도 먹는다고 계산한다.
def measure_mem(model, tokenizer, text):
    """Print a rough estimate of the CUDA memory used by one forward pass.

    Tokenizes ``text``, runs ``model`` once under ``torch.no_grad()`` and
    prints two lines:

    * the growth of ``torch.cuda.memory_allocated()`` across the call, i.e.
      memory that is still held afterwards (the model output is kept in a
      local until the reading is taken), in total and per input token;
    * the peak allocation reached during the call. This is the figure that
      matters when chasing an out-of-memory error, because transient buffers
      are already freed again by the time the "after" reading is taken.

    Args:
        model: Callable that accepts the ``input_ids`` tensor and exposes a
            ``device`` attribute.
        tokenizer: Callable accepting ``(text, return_tensors="pt")`` and
            returning an object with an ``input_ids`` tensor.
        text: Prompt string to measure.

    Returns:
        None. The results are written to stdout.
    """
    # Collect unreachable Python objects and release unused cached blocks so
    # leftovers from earlier work do not skew the readings.
    gc.collect()
    torch.cuda.empty_cache()

    # Tokenize and move the ids to wherever the model lives.
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)

    # Restart peak tracking so the peak reflects this forward pass only.
    # Guarded because the reset needs a usable CUDA device.
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    before = torch.cuda.memory_allocated()

    # Inference only: no autograd graph is recorded.
    with torch.no_grad():
        # Bind the output so it stays allocated for the "after" reading.
        _ = model(input_ids)

    after = torch.cuda.memory_allocated()
    peak = torch.cuda.max_memory_allocated()

    # Convert byte counts to MiB.
    mib = 1024 ** 2
    used = (after - before) / mib
    peak_used = (peak - before) / mib

    token_len = input_ids.shape[1]
    # A zero-token input must not crash the report with ZeroDivisionError.
    per_token = used / token_len if token_len else 0.0

    print(f"Tokens: {token_len}, Used Memory: {used:.2f} MB, Per Token: {per_token:.4f} MB")
    print(f"Peak During Forward: {peak_used:.2f} MB")


# Model to load.
model_name = "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct"

# Load the weights as float16 and let device_map="auto" decide placement.
# NOTE(security): trust_remote_code=True allows Python code shipped in the
# model repository to run locally; enable it only for repositories you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)
# device_map is a model-loading option, so it is not passed to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Size of the model's context window, in tokens.
MAX_TOKEN_LENGTH = model.config.max_position_embeddings

# Number of tokens the model is allowed to generate.
MAX_NEW_TOKENS = 100

# Input text.
input_text = "프랑스의 수도는 어디 인가요? 간.단.하.게 알려주세요."

measure_mem(model, tokenizer, input_text)

# Chat messages that are rendered through the model's chat template.
prompt_template = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": input_text},
]

# Render the chat template to a plain string (tokenize=False) and append the
# marker that cues the assistant's reply (add_generation_prompt=True).
# Tokenization happens in the next step, so no tensor options are given here.
prompt = tokenizer.apply_chat_template(
    prompt_template,
    tokenize=False,
    add_generation_prompt=True,
)

# Tokenize once. truncation=True only takes effect when the prompt is longer
# than max_length, so a separate length check is unnecessary. The limit
# leaves room for MAX_NEW_TOKENS so that prompt plus generated tokens stay
# within the context window. A prompt anywhere near this limit will very
# likely run out of CUDA memory first anyway.
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    truncation=True,
    max_length=max(1, MAX_TOKEN_LENGTH - MAX_NEW_TOKENS),
).to(model.device)

# Fall back to the EOS token when the tokenizer defines no padding token.
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None:
    pad_token_id = tokenizer.eos_token_id

# Feed the prompt to the model and generate a reply.
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        max_new_tokens=MAX_NEW_TOKENS,

        # False: always pick the most probable token.
        # True: sample a token at random from the probability distribution.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=pad_token_id,
    )

# Decode the generated sequences; the decoded text includes the prompt
# because the returned sequences start with the input tokens.
outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Print the result.
print("Generated Response:", outputs[0])

늘 공부하는 괴짜

인공지능이라는 옷을 입었습니다. 뭔가 멋지면서도 잘 맞습니다.

이전 포스트

[AI] CUDA out of memory...

AI : Langchain (RAG)

1. VRAM 이 적으면...

2. 끼야야야약! ㅠ

3. GPT 에게 물어보자.

(모델 파라미터 크기) + (입력 토큰 길이 × 모델 크기에 비례하는 활성화 메모리) × batch size

4. 계산하는 메서드 구현

5. 테스트

6. 전체 소스

[AI] mongodb 로 RAG 구현해보자.

0개의 댓글