import random


class Nim():

    def __init__(self, initial=[4, 4, 4, 4]):
        self.piles = initial.copy()
        self.player = 0  # Player 0 starts
        self.winner = None

    @classmethod
    def available_actions(cls, piles):
        """
        Return a set of all (pile, count) actions available in `piles`.
        """
        actions = set()
        for i, pile in enumerate(piles):
            for j in range(1, pile + 1):
                actions.add((i, j))
        return actions

    @classmethod
    def other_player(cls, player):
        return 0 if player == 1 else 1

    def switch_player(self):
        self.player = Nim.other_player(self.player)

    def move(self, action):
        pile, count = action
        self.piles[pile] -= count
        self.switch_player()
        # The player who removes the last object loses, so the player
        # to move after the final removal is the winner.
        if all(pile == 0 for pile in self.piles):
            self.winner = self.player


class NimAI():

    def __init__(self, alpha=0.5, epsilon=0.1):
        self.q = dict()  # Q-value table, keyed by (state, action) tuples
        self.alpha = alpha      # Learning rate
        self.epsilon = epsilon  # Exploration rate

    def update(self, old_state, action, new_state, reward):
        """
        Perform one Q-learning update for the transition
        (old_state, action) -> new_state with the given reward.
        """
        old_q = self.get_q_value(old_state, action)
        best_future_q = self.best_future_reward(new_state)
        self.update_q_value(old_state, action, old_q, reward, best_future_q)

    def get_q_value(self, state, action):
        """
        Return the Q-value for a given state-action pair.

        Parameters:
            state (list): The current game state.
            action (tuple): The action being evaluated.

        Returns:
            float: The Q-value associated with the (state, action) pair.
            Returns 0 if the pair is not yet in the Q-table.
        """
        return self.q.get((tuple(state), action), 0)

    def update_q_value(self, state, action, old_q, reward, future_q):
        """
        Update the Q-value for a state-action pair using the Q-learning
        formula (with an implicit discount factor of 1):

            Q(s, a) <- Q(s, a) + alpha * (reward + max_a' Q(s', a') - Q(s, a))

        Parameters:
            state (list): The current game state.
            action (tuple): The action taken.
            old_q (float): The previous Q-value for the (state, action) pair.
            reward (float): The reward received after taking the action.
            future_q (float): The maximum Q-value for the next state.
        """
        self.q[tuple(state), action] = old_q + self.alpha * (reward + future_q - old_q)

    def best_future_reward(self, state):
        """
        Determine the highest Q-value among all possible actions in a
        given state.

        Parameters:
            state (list): The state for which to compute the best future
            reward.

        Returns:
            float: The highest Q-value among available actions; actions
            not yet in the Q-table count as 0. Returns 0 if no actions
            are available.
        """
        actions = Nim.available_actions(state)
        if not actions:
            return 0
        return max(self.get_q_value(state, action) for action in actions)

    def choose_action(self, state, epsilon=True):
        """
        Choose an action for the given state using an epsilon-greedy
        strategy.

        Parameters:
            state (list): The current game state.
            epsilon (bool): If True, use epsilon-greedy exploration;
            otherwise, always choose the best known action.

        Returns:
            tuple: The chosen action from the available actions.
""" if epsilon: return random.choice(tuple(Nim.available_actions(state))) # keys = [key[1] for key in self.q.key if key[0] == state] # for key in keys: else: try: return max([key[1] for key in self.q.keys() if key[0] == state]) except: return (0,0) def train(n): player = NimAI() for i in range(n): game = Nim([4, 4, 4, 4]) last_move = {0: {"state": None, "action": None}, 1: {"state": None, "action": None}} while True: state = game.piles.copy() action = player.choose_action(state) last_move[game.player]["state"] = state last_move[game.player]["action"] = action game.move(action) new_state = game.piles.copy() if game.winner is not None: player.update(state, action, new_state, -1) player.update(last_move[game.player]["state"], last_move[game.player]["action"], new_state, 1) break elif last_move[game.player]["state"] is not None: player.update(last_move[game.player]["state"], last_move[game.player]["action"], new_state, 0) return player