nim/nim.py

import random


class Nim():
    def __init__(self, initial=[4, 4, 4, 4]):
        self.piles = initial.copy()
        self.player = 0  # Player 0 starts
        self.winner = None

    @classmethod
    def available_actions(cls, piles):
        actions = set()
        for i, pile in enumerate(piles):
            for j in range(1, pile + 1):
                actions.add((i, j))
        return actions
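
    # For illustration (example values, not from the original file): with
    # piles [2, 0], the legal (pile, count) actions are
    #   Nim.available_actions([2, 0]) == {(0, 1), (0, 2)}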

    @classmethod
    def other_player(cls, player):
        return 0 if player == 1 else 1

    def switch_player(self):
        self.player = Nim.other_player(self.player)

    def move(self, action):
        pile, count = action
        self.piles[pile] -= count
        self.switch_player()
        # Misère rule: the player who takes the last object loses, so once
        # all piles are empty the player now to move is the winner.
        if all(pile == 0 for pile in self.piles):
            self.winner = self.player


class NimAI():
    def __init__(self, alpha=0.5, epsilon=1):
        self.q = dict()  # Q-value table, keyed by (state, action)
        # Sample Q-values seeded for testing:
        self.q[((1, 1, 1, 0), (0, 1))] = 0.4
        self.q[((1, 1, 1, 0), (1, 1))] = 0.9
        self.q[((1, 1, 1, 0), (2, 1))] = 0.7
        self.q[((2, 1, 1, 0), (0, 1))] = 0.2
        self.alpha = alpha      # Learning rate
        self.epsilon = epsilon  # Exploration rate

    def update(self, old_state, action, new_state, reward):
        """Perform one Q-learning step for an observed state transition."""
        old_q = self.get_q_value(old_state, action)
        best_future_q = self.best_future_reward(new_state)
        self.update_q_value(old_state, action, old_q, reward, best_future_q)

    def get_q_value(self, state, action):
        """
        Return the Q-value for a given state-action pair.

        Parameters:
            state (list): The current game state.
            action (tuple): The action being evaluated.

        Returns:
            float: The Q-value associated with the (state, action) pair.
            Returns 0 if the pair is not yet in the Q-table.
        """
        # States are stored as tuples so they can serve as dict keys.
        return self.q.get((tuple(state), action), 0)

    def update_q_value(self, state, action, old_q, reward, future_q):
        """
        Update the Q-value for a state-action pair using the Q-learning formula,
        with the discount factor γ fixed at 1:

            Q(s, a) ← Q(s, a) + α * (reward + max_a' Q(s', a') - Q(s, a))

        Parameters:
            state (list): The current game state.
            action (tuple): The action taken.
            old_q (float): The previous Q-value for the (state, action) pair.
            reward (float): The reward received after taking the action.
            future_q (float): The maximum Q-value for the next state.
        """
        self.q[tuple(state), action] = old_q + self.alpha * (reward + future_q - old_q)
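
    # Worked example of the update rule (illustrative numbers, not from the
    # original file): with alpha=0.5, old_q=0.2, reward=0 and future_q=0.9,
    # the new estimate is 0.2 + 0.5 * (0 + 0.9 - 0.2) = 0.55.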

    def best_future_reward(self, state):
        """
        Determine the highest Q-value among all possible actions in a given state.

        Parameters:
            state (list): The state for which to compute the best future reward.

        Returns:
            float: The highest Q-value among available actions.
            Returns 0 if no actions are available.
        """
        actions = Nim.available_actions(state)
        if not actions:
            return 0
        # Unknown (state, action) pairs contribute 0 via get_q_value.
        return max(self.get_q_value(state, action) for action in actions)

    def choose_action(self, state, epsilon=True):
        """
        Choose an action for the given state using an epsilon-greedy strategy.

        Parameters:
            state (list): The current game state.
            epsilon (bool): If True, pick a random action with probability
                self.epsilon; otherwise, always pick the best-known action.

        Returns:
            tuple: The chosen action from the available actions.
        """
        actions = list(Nim.available_actions(state))
        if epsilon and random.random() < self.epsilon:
            return random.choice(actions)
        # Greedy choice: the available action with the highest Q-value
        # (unknown pairs count as 0, ties broken arbitrarily).
        return max(actions, key=lambda action: self.get_q_value(state, action))
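
# With the sample Q-values seeded in NimAI.__init__, a greedy call such as
#   NimAI().choose_action([1, 1, 1, 0], epsilon=False)
# returns (1, 1), the action with the highest stored value (0.9).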


def train(n):
    """Train a NimAI by letting it play n games against itself."""
    player = NimAI()
    for i in range(n):
        game = Nim([4, 4, 4, 4])
        # Track each player's most recent (state, action) so the delayed
        # reward can be credited once the game is decided.
        last_move = {0: {"state": None, "action": None}, 1: {"state": None, "action": None}}
        while True:
            state = game.piles.copy()
            action = player.choose_action(state)
            last_move[game.player]["state"] = state
            last_move[game.player]["action"] = action
            game.move(action)
            new_state = game.piles.copy()
            if game.winner is not None:
                # The mover took the last object and lost (-1); the opponent
                # (now game.player) made the winning move (+1).
                player.update(state, action, new_state, -1)
                player.update(last_move[game.player]["state"], last_move[game.player]["action"], new_state, 1)
                break
            elif last_move[game.player]["state"] is not None:
                # Intermediate moves carry no immediate reward.
                player.update(last_move[game.player]["state"], last_move[game.player]["action"], new_state, 0)
    return player
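

# A minimal usage sketch (not part of the original file): train briefly, then
# ask the trained AI for a greedy move from the opening position.
if __name__ == "__main__":
    ai = train(1000)
    print("Suggested opening move:", ai.choose_action([4, 4, 4, 4], epsilon=False))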