Reinforcement Learning: Monte Carlo Practice

  1. Initialize the Grid World.

import numpy as np

# GridworldEnv is assumed to be provided by the course material
# (a gym-style 5x5 grid world environment).
nx, ny = 5, 5
env = GridworldEnv([ny, nx])
  2. Define a function that selects an action in the current state.

def get_action(state):
    # Fixed behavior policy: choose one of the 4 actions uniformly at random.
    action = np.random.choice(range(4))
    return action
def run_episode(env, timeout=1000):
    env.reset()
    states = []
    actions = []
    rewards = []
    i = 0
    timeouted = False

    while True:
        state = env.s
        action = get_action(state)
        next_state, reward, done, info = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        if done:
            break
        i += 1
        if i >= timeout:
            timeouted = True
            break

    # The original returned `episode` only when the episode finished in time,
    # which raises a NameError on timeout; return None instead.
    if timeouted:
        return None
    return (states, actions, rewards)
episodes = []
no_episode = 10000
for _ in range(no_episode):
    episode = run_episode(env)
    if episode is not None:  # skip episodes that hit the timeout
        episodes.append(episode)
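
A minimal sanity check on the collected rollouts (a sketch using the names defined above):

# How many episodes finished before the timeout, and what the first one looks like.
print(len(episodes))
states, actions, rewards = episodes[0]
print(len(states), rewards[-1])  # episode length and terminal reward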

MC
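
Monte Carlo policy evaluation estimates V(s) as the average return observed after visiting s. Returns satisfy the recursion G_t = r_{t+1} + gamma * G_{t+1}, so iterating over each episode in reverse builds every return in a single backward pass. The code below keeps running sums (s_v, s_q) and visit counts (n_v, n_q) and divides at the end, i.e. every-visit MC.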

s_v = np.zeros(shape=nx * ny)       # sum of returns per state
s_q = np.zeros(shape=(nx * ny, 4))  # sum of returns per (state, action)
n_v = np.zeros(shape=nx * ny)       # visit count per state
n_q = np.zeros(shape=(nx * ny, 4))  # visit count per (state, action)
gamma = 1.0

for episode in episodes:
    states, actions, rewards = episode

    # Reverse the inputs so each return can be accumulated
    # efficiently from the end of the episode.
    states = reversed(states)
    actions = reversed(actions)
    rewards = reversed(rewards)

    cum_r = 0
    for s, a, r in zip(states, actions, rewards):
        # G_t = r_{t+1} + gamma * G_{t+1}
        cum_r *= gamma
        cum_r += r

        n_v[s] += 1
        n_q[s, a] += 1

        s_v[s] += cum_r
        s_q[s, a] += cum_r

# Average the summed returns; guard against division by zero for
# states and (state, action) pairs that were never visited.
v = np.divide(s_v, n_v, out=np.zeros_like(s_v), where=n_v > 0)
q = np.divide(s_q, n_q, out=np.zeros_like(s_q), where=n_q > 0)
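
To eyeball the result, reshape the flat value table back onto the grid (a quick sketch; assumes the usual row-major state indexing of the environment):

# State-value estimates arranged on the 5x5 grid.
print(v.reshape(ny, nx))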

Incremental MC
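
Instead of storing sums and counts, incremental MC nudges each estimate toward every observed return with a constant step size: V(s) <- V(s) + alpha * (G_t - V(s)). This is an exponentially weighted average of the returns; choosing alpha_n = 1/n instead would reproduce the exact sample average computed above.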

v = np.zeros(shape=nx * ny)
q = np.zeros(shape=(nx * ny, 4))
gamma = 1.0
alpha = 1e-3

# ------------------- MISSION ---------------------
# Write your code here, reusing the code above.

for episode in episodes:
    states, actions, rewards = episode

    # Reverse the inputs, as before, to build returns from the end.
    states = reversed(states)
    actions = reversed(actions)
    rewards = reversed(rewards)

    cum_r = 0
    for s, a, r in zip(states, actions, rewards):
        # The original omitted the discount; include it so the return
        # matches G_t = r_{t+1} + gamma * G_{t+1}.
        cum_r = gamma * cum_r + r
        # Move each estimate a step of size alpha toward the observed return.
        v[s] = v[s] + alpha * (cum_r - v[s])
        q[s, a] = q[s, a] + alpha * (cum_r - q[s, a])
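
Because this cell reassigns v and q, the sample averages from the previous section are overwritten, but they can be recomputed from the stored sums and counts for a side-by-side check (a sketch; with a constant alpha the incremental estimate keeps some bias toward its zero initialization, so expect the two to be close rather than identical):

# Compare the incremental estimates against the plain sample averages.
v_avg = np.divide(s_v, n_v, out=np.zeros_like(s_v), where=n_v > 0)
print(np.abs(v - v_avg).max())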
