import random


class Nim():

    def __init__(self, initial=[4, 4, 4, 4]):
        self.piles = initial.copy()
        self.player = 0  # Player 0 starts
        self.winner = None

    @classmethod
    def available_actions(cls, piles):
        """
        Return a set of all (pile, count) actions available in `piles`.
        """
        actions = set()
        for i, pile in enumerate(piles):
            for j in range(1, pile + 1):
                actions.add((i, j))
        return actions

    @classmethod
    def other_player(cls, player):
        return 0 if player == 1 else 1

    def switch_player(self):
        self.player = Nim.other_player(self.player)

    def move(self, action):
        pile, count = action
        self.piles[pile] -= count
        self.switch_player()
        # The player who removes the last object loses, so the player
        # to move after the final removal is the winner.
        if all(pile == 0 for pile in self.piles):
            self.winner = self.player


class NimAI():

    def __init__(self, alpha=0.5, epsilon=0.1):
        self.q = dict()  # Q-value table, keyed by (state, action) tuples
        self.alpha = alpha      # Learning rate
        self.epsilon = epsilon  # Exploration rate

    def update(self, old_state, action, new_state, reward):
        """
        Perform one Q-learning update for the transition
        (old_state, action) -> new_state with the given reward.
        """
        old_q = self.get_q_value(old_state, action)
        best_future_q = self.best_future_reward(new_state)
        self.update_q_value(old_state, action, old_q, reward, best_future_q)

    def get_q_value(self, state, action):
        """
        Return the Q-value for a given state-action pair.

        Parameters:
            state (list): The current game state.
            action (tuple): The action being evaluated.

        Returns:
            float: The Q-value associated with the (state, action) pair.
            Returns 0 if the pair is not yet in the Q-table.
        """
        return self.q.get((tuple(state), action), 0)

    def update_q_value(self, state, action, old_q, reward, future_q):
        """
        Update the Q-value for a state-action pair using the Q-learning
        formula (with an implicit discount factor of 1):

            Q(s, a) <- Q(s, a) + alpha * (reward + max_a' Q(s', a') - Q(s, a))

        Parameters:
            state (list): The current game state.
            action (tuple): The action taken.
            old_q (float): The previous Q-value for the (state, action) pair.
            reward (float): The reward received after taking the action.
            future_q (float): The maximum Q-value for the next state.
        """
        self.q[tuple(state), action] = old_q + self.alpha * (reward + future_q - old_q)

    def best_future_reward(self, state):
        """
        Determine the highest Q-value among all possible actions in a
        given state.

        Parameters:
            state (list): The state for which to compute the best future
            reward.

        Returns:
            float: The highest Q-value among available actions; actions
            not yet in the Q-table count as 0. Returns 0 if no actions
            are available.
        """
        actions = Nim.available_actions(state)
        if not actions:
            return 0
        return max(self.get_q_value(state, action) for action in actions)

    def choose_action(self, state, epsilon=True):
        """
        Choose an action for the given state using an epsilon-greedy
        strategy.

        Parameters:
            state (list): The current game state.
            epsilon (bool): If True, use epsilon-greedy exploration;
            otherwise, always choose the best known action.

        Returns:
            tuple: The chosen action from the available actions.
""" if epsilon: return random.choice(tuple(Nim.available_actions(state))) # keys = [key[1] for key in self.q.key if key[0] == state] # for key in keys: else: try: return max([key[1] for key in self.q.keys() if key[0] == state]) except: return (0,0) def train(n): player = NimAI() for i in range(n): game = Nim([4, 4, 4, 4]) last_move = {0: {"state": None, "action": None}, 1: {"state": None, "action": None}} while True: state = game.piles.copy() action = player.choose_action(state) last_move[game.player]["state"] = state last_move[game.player]["action"] = action game.move(action) new_state = game.piles.copy() if game.winner is not None: player.update(state, action, new_state, -1) player.update(last_move[game.player]["state"], last_move[game.player]["action"], new_state, 1) break elif last_move[game.player]["state"] is not None: player.update(last_move[game.player]["state"], last_move[game.player]["action"], new_state, 0) return player