Reinforcement Learning Example - Tic Tac Toe#

import random
import numpy as np
import matplotlib.pyplot as plt
import copy
n_states = 9*8*7*6*5*4*3*2
state_values = np.zeros(n_states)
p2_state_values = np.zeros(n_states)
# outer loop is the number of sets of inner loops to run
# inner loop is just to print out stats for the given number of
# games at a time
outer_loops = 100 # random training
outer_model_loops = 100 # training vs model
inner_loop = 1000
test_games = 100  # games to test after training inner loop
learning_rate = 0.15
win_score = 1
lose_score = -1
draw_score = 0.9
learn_while_testing = True
debug = False
# helper function to return the state id given a sequence of moves
def state_id(moves):
    coef = 1
    count = 0
    result = 0
    all_moves = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    for id in moves:
        idx = all_moves.index(id)
        all_moves.pop(idx)
        result += coef * idx
        coef *= (9-count)
        count += 1

    return result
def get_valid_moves(state):
    all_moves = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    return list(set(all_moves) - set(state))

# assumes board is labeled like
#   0  |  1  |  2
#   --------------
#   3  |  4  |  5
#   --------------
#   6  |  7  |  8
#
# Player 1 moves are in the even indexes (0,2,4,6,8)
# Player 2 moves are in the odd indexes (1,3,5,7)
#
# returns:
#   0 -- no winner
#   1 -- player 1 wins
#   2 -- player 2 wins
#   3 -- draw


def check_for_winner(state):
    result = 0
    win1 = set([0, 1, 2])
    win2 = set([3, 4, 5])
    win3 = set([6, 7, 8])
    win4 = set([0, 3, 6])
    win5 = set([1, 4, 7])
    win6 = set([2, 5, 8])
    win7 = set([0, 4, 8])
    win8 = set([2, 4, 6])

    p1_moves = set(state[::2])
    p2_moves = set(state) - p1_moves

    # this could be simpliefied as a loop ...
    if (win1.issubset(p1_moves) or win2.issubset(p1_moves) or win3.issubset(p1_moves) or win4.issubset(p1_moves) or win5.issubset(p1_moves) or win6.issubset(p1_moves) or win7.issubset(p1_moves) or win8.issubset(p1_moves)):

        result = 1

    elif (win1.issubset(p2_moves) or
          win2.issubset(p2_moves) or
          win3.issubset(p2_moves) or
          win4.issubset(p2_moves) or
          win5.issubset(p2_moves) or
          win6.issubset(p2_moves) or
          win7.issubset(p2_moves) or
          win8.issubset(p2_moves)):

        result = 2

    elif len(state) == 9:
        result = 3

    return result

def update_values(state, learning_rate, debug=False):

    game = copy.deepcopy(state)
    winner = check_for_winner(game)
    if (winner == 1):
        p1_val = win_score
        p2_val = lose_score
    elif (winner == 2):
        p1_val = lose_score
        p2_val = win_score
    else:
        p1_val = draw_score
        p2_val = draw_score

    id = state_id(game)

    if debug:
        print("game: ", game)
        print("winner: ", winner)
        print("p1 val:", p1_val)
        print("p2_val: ", p2_val)
        print("id: ", id)
        print("before state_values: ", state_values[id])
        print("before p2 state value: ", p2_state_values[id])
        
    # final state gets value assigned. Either win or draw
    state_values[id] = p1_val
    p2_state_values[id] = p2_val
    
    prev_val = p1_val
    p2_prev_val = p2_val

    # rest of values are defined as
    # v(s_n) = v(s_n) + learning_rate * (v(s_n+1) - v(s_n))
    while len(game) > 1:
        game.pop()
        id = state_id(game)
        cur_val = state_values[id]
        p2_cur_val = p2_state_values[id]
        state_values[id] += learning_rate*(prev_val - cur_val)
        p2_state_values[id] += learning_rate*(p2_prev_val - p2_cur_val)
        if debug:
            print("game: ", game)
            print("id: ", id)
            print("prev val:", prev_val)
            print("p2 prev val: ", p2_prev_val)
            print("cur val:", cur_val)
            print("p2 cur val: ", p2_cur_val)
            print("after state_values: ", state_values[id])
            print("after p2 state value: ", p2_state_values[id])
        prev_val = state_values[id]
        p2_prev_val = p2_state_values[id]

def random_move(state, force_win=False):

    # play winning move if there is one otherwise play randomly
    valid_moves = get_valid_moves(state)
    move_id = -1
    if force_win:
        for i in range(0, len(valid_moves)):
            test_move = copy.deepcopy(state)
            test_move.append(valid_moves[i])
            if (check_for_winner(test_move) != 0):
                move_id = i
                break
    if move_id == -1:
        move_id = random.randint(0, len(valid_moves) - 1)
    state.append(valid_moves[move_id])


def get_scores(state):
    valid_moves = get_valid_moves(state)

    # compute values of each move
    scores = []
    for move in valid_moves:
        test_state = copy.deepcopy(state)
        test_state.append(move)
        id = state_id(test_state)
        if (len(state) % 2 == 0):
            score = state_values[id]
        else:
            score = p2_state_values[id]
        scores.append(score)
    return scores
def model_move(state):

    scores = get_scores(state)
    valid_moves = get_valid_moves(state)
    move_id = np.argmax(scores)
    state.append(valid_moves[move_id])
def mcts_move(state, iters=25):

    # just allocate max space, it would certainly be smarter to
    # set up a real tree ...
    mcts_scores = np.zeros(n_states)
    mcts_visits = np.zeros(n_states)
    mcts_wins = np.zeros(n_states)

    c = np.sqrt(2.0)

    # play requested iterations
    for i in range(0,iters):

        # play out unvisited moves
        test_game = copy.deepcopy(state)
        node_id = -1
        p_id = state_id(test_game)
        valid_moves = get_valid_moves(test_game)

        while check_for_winner(test_game) == 0:

            # check for unvisited moves at this level
            unvisited_move = False
            for move in valid_moves:
                test_move = copy.deepcopy(test_game)
                test_move.append(move)
                id = state_id(test_move)
                if mcts_visits[id] == 0:
                    test_game = copy.deepcopy(test_move)
                    # randomly play out game
                    while (check_for_winner(test_game) == 0):
                        random_move(test_game)
                    unvisited_move = True
                    node_id = id
                    break
            
            # compute scores for each possible move
            if not unvisited_move:
                parent_n = np.log(mcts_visits[p_id])
                scores = []
                for move in valid_moves:
                    test_move = copy.deepcopy(test_game)
                    test_move.append(move)
                    id = state_id(test_move)
                    score = mcts_wins[id] / mcts_visits[id] + c*np.sqrt(parent_n / mcts_visits[id])
                    scores.append(score)
                # select move with highest score
                id = np.argmax(scores)
                test_game.append(valid_moves[id])
                p_id = state_id(test_game)
                valid_moves = get_valid_moves(test_game)

        winner = check_for_winner(test_game)
        win = 0
        score = 0
        if (winner == 3):
            score = draw_score
        elif (len(state) % 2 + 1 == winner):
            score = win_score
            win = 1
        else:
            score = lose_score
  
        
        # update vals
        id = state_id(state)
        mcts_scores[id] += score
        mcts_visits[id] += 1
        mcts_wins[id] += win

        test_move = copy.deepcopy(state)
        done = False
        for j in range(len(state), len(test_game)):
            if done:
                break
            test_move.append(test_game[j])
            id = state_id(test_move)
            if mcts_visits[id] == 0:
                done = True
            mcts_scores[id] += score
            mcts_visits[id] += 1
            mcts_wins[id] += win

    # update state with highest score
    valid_moves = get_valid_moves(state)
    scores = []
    # print("state: ", state)
    for move in valid_moves:
        test_game = copy.deepcopy(state)
        test_game.append(move)
        id = state_id(test_game)
        scores.append(mcts_scores[id]/mcts_visits[id])
        # print("move: ", move)
        # print("\tmcts_score: ", mcts_scores[id])
        # print("\tmcts_visits: ", mcts_visits[id])
        # print("\tmcts_wins: ", mcts_wins[id])
        # print("\tavg mcts_score: ", mcts_scores[id]/mcts_visits[id])
    id = np.argmax(scores)
    state.append(valid_moves[id])
def play_games(n_games, learning_rate=0.15, update_model=False, p1_mode='random', p2_mode='random'):

    p1_wins = 0
    p2_wins = 0
    draws = 0
    for i in range(0,n_games):

        state = []
        while (check_for_winner(state) == 0):

                # Player 1 move
                if p1_mode=='random':
                    random_move(state)
                elif p1_mode=='model':
                    model_move(state)
                elif p1_mode=='mcts':
                    mcts_move(state)

                # stop if p1 won
                if check_for_winner(state) != 0:
                    break

                # player 2 move
                if p2_mode=='random':
                    random_move(state)
                elif p2_mode=='model':
                    model_move(state)
                elif p2_mode=='mcts':
                    mcts_move(state)

        if update_model:
            update_values(state, learning_rate)

        winner = check_for_winner(state)
        if (winner == 1):
            p1_wins += 1
        elif (winner == 2):
            p2_wins += 1
        else:
            draws += 1

    return p1_wins, p2_wins, draws
def print_move_sequence(state):
    print("state: ", state)
    for i in range(0, len(state)):

        print("Move ", i+1)
        chars = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
        next_play = 'X'
        for play in state[0:i+1]:
            chars[play] = next_play
            if (next_play == 'X'):
                next_play = 'O'
            else:
                next_play = 'X'

        # print the scores and the next state
        scores = get_scores(state[0:i])
        score_chars = [' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
        next_play = '  X '
        for play in state[0:i]:
            score_chars[play] = next_play
            if (next_play == '  X '):
                next_play = '  O '
            else:
                next_play = '  X '
        moves = get_valid_moves(state[0:i])
        for j in range(0, len(moves)):
            id = moves[j]
            score_chars[id] = "{:.2f}".format(scores[j])

        print(" %s | %s | %s \t %s | %s | %s " % (
                score_chars[0], score_chars[1], score_chars[2], chars[0], chars[1], chars[2]))
        print("------------------- \t -----------")
        print(" %s | %s | %s \t %s | %s | %s " % (
                score_chars[3], score_chars[4], score_chars[5], chars[3], chars[4], chars[5]))
        print("------------------- \t -----------")
        print(" %s | %s | %s \t %s | %s | %s " % (
                score_chars[6], score_chars[7], score_chars[8], chars[6], chars[7], chars[8]))
        print("")
p1_wins_v_random = []
p2_wins_v_random = []
p1_losses_v_random = []
p2_losses_v_random = []
training_games = []

# Train and test 
for outer in range(0, outer_loops):

    # play random games and update model
    play_games(inner_loop, learning_rate, update_model=True)

    training_games.append((outer+1)*inner_loop)

    # test with player 2 using model, player 1 random
    player_1_wins, player_2_wins, draws = play_games(test_games, learning_rate, update_model=learn_while_testing, p1_mode='random', p2_mode='model')

    p2_wins_v_random.append(player_2_wins)
    p2_losses_v_random.append(player_1_wins)

    # test with player 1 using model, player 2 random
    player_1_wins, player_2_wins, draws = play_games(test_games, learning_rate, update_model=learn_while_testing, p1_mode='model', p2_mode='random')

    p1_wins_v_random.append(player_1_wins)
    p1_losses_v_random.append(player_2_wins)
# Train and test using model
for outer in range(0, outer_model_loops):

    # play random games and update model
    play_games(inner_loop, learning_rate, update_model=True)

    # train with player 2 using model, player 1 random
    play_games(inner_loop, learning_rate, update_model=True, p1_mode='random',  p2_mode='model')
    # train with player 1 using model, player 2 random
    play_games(inner_loop, learning_rate, update_model=True, p1_mode='model', p2_mode='random')

    training_games.append(outer_loops*inner_loop + 2*(outer+1)*inner_loop)

    # test with player 2 using model, player 1 mcts
    player_1_wins, player_2_wins, draws = play_games(test_games, learning_rate, update_model=learn_while_testing, p1_mode='random', p2_mode='model')

    p2_wins_v_random.append(player_2_wins)
    p2_losses_v_random.append(player_1_wins)

    # test with player 1 using model, player 2 mcts
    player_1_wins, player_2_wins, draws = play_games(test_games, learning_rate, update_model=learn_while_testing, p1_mode='model', p2_mode='random')

    p1_wins_v_random.append(player_1_wins)
    p1_losses_v_random.append(player_2_wins)
# try to visualize a loss
for i in range(0, test_games):

    # player 1 plays randomly
    state = []

    while (check_for_winner(state) == 0):

        # Just play completely randomly
        random_move(state)

        # stop if p1 won
        if check_for_winner(state) != 0:
            break

        # player 2 uses model
        model_move(state)

    winner = check_for_winner(state)
    if winner == 1:
        print_move_sequence(state)
        update_values(state, learning_rate, True)
        break
if len(p1_wins_v_random) > 0:
    fig, ax = plt.subplots()
    line1, = ax.plot(np.array(training_games) / 1000, np.array(p1_wins_v_random) /
                    test_games * 100, 'g-', label='Player 1 wins vs Random')
    line2, = ax.plot(np.array(training_games) / 1000, np.array(p2_wins_v_random) /
                    test_games * 100, 'b-', label='Player 2 wins vs Random')
    line3, = ax.plot(np.array(training_games) / 1000, np.array(p1_losses_v_random) /
                    test_games * 100, 'g--', label='Player 1 losses vs Random')
    line4, = ax.plot(np.array(training_games) / 1000, np.array(p2_losses_v_random) /
                    test_games * 100, 'b--', label='Player 2 losses vs Random')
    ax.set_xlabel('Random Training Games (1000s)')
    ax.set_ylabel('Win Percentage')
    ax.set_title('Model Performance Based on Training Size')
    ax.legend()
    plt.show()