From d547590543f4a3755869d58c983d66ccc8422e7b Mon Sep 17 00:00:00 2001 From: Steven Date: Wed, 19 Jun 2024 23:51:17 -0700 Subject: [PATCH] Update CFR abstraction training --- src/aiplayer.py | 172 ++++++++++++--- src/base.py | 49 +++-- src/environment.py | 10 +- src/holdem_flop.py | 470 ----------------------------------------- src/kuhn.py | 8 +- src/postflop_holdem.py | 104 ++++++--- 6 files changed, 253 insertions(+), 560 deletions(-) delete mode 100644 src/holdem_flop.py diff --git a/src/aiplayer.py b/src/aiplayer.py index 2bfea73..993e497 100644 --- a/src/aiplayer.py +++ b/src/aiplayer.py @@ -73,9 +73,44 @@ def trash_talk_fold(self): self.engine.say(random.choice(self.get_trash_talk("opponent_fold"))) self.engine.runAndWait() - def place_bet(self, observed_env) -> int: # AI will call every time + def process_action(self, action, observed_env): + if action == "k": # check + if observed_env.game_stage == 2: + self.current_bet = 2 + else: + self.current_bet = 0 + + self.engine.say("I Check") + elif action == "c": + if observed_env.get_highest_current_bet() == self.player_balance: + self.engine.say("I call your all-in. You think I'm afraid?") + else: + self.engine.say(random.choice(self.get_trash_talk("c"))) + # If you call on the preflop + self.current_bet = observed_env.get_highest_current_bet() + elif action == "f": + self.engine.say(random.choice(self.get_trash_talk("f"))) + else: + self.current_bet = int(action[1:]) + if self.current_bet == self.player_balance: + self.engine.say(random.choice(self.get_trash_talk("all_in"))) + else: + self.engine.say(random.choice(self.get_trash_talk("b", self.current_bet))) + + self.engine.runAndWait() + + def place_bet(self, observed_env): + raise NotImplementedError + - # Strategy with Heuristic +class EquityAIPlayer(AIPlayer): + def __init__(self, balance) -> None: + super().__init__(balance) + + def place_bet(self, observed_env) -> int: # AI will call every time + """ + A Strategy implemented with human heuristics + """ if "k" in observed_env.valid_actions(): action = "k" else: @@ -83,7 +118,7 @@ def place_bet(self, observed_env) -> int: # AI will call every time card_str = [str(card) for card in self.hand] community_cards = [str(card) for card in observed_env.community_cards] - # if observed_env.game_stage == 2: + equity = calculate_equity(card_str, community_cards) # fold, check / call, raise @@ -96,7 +131,9 @@ def place_bet(self, observed_env) -> int: # AI will call every time ): # If you are the dealer, raise more of the time strategy = { "k": np_strategy[0], - f"b{min(max(observed_env.BIG_BLIND, int(observed_env.total_pot_balance / 3)), self.player_balance)}": np_strategy[2], + f"b{min(max(observed_env.BIG_BLIND, int(observed_env.total_pot_balance / 3)), self.player_balance)}": np_strategy[ + 2 + ], f"b{min(observed_env.total_pot_balance, self.player_balance)}": np_strategy[1], } else: @@ -138,37 +175,118 @@ def place_bet(self, observed_env) -> int: # AI will call every time print("equity", equity) print("AI strategy ", strategy) action = getAction(strategy) + self.process_action(action, observed_env) + return action - # history = HoldEmHistory(observed_env.history) - # strategy = observed_env.get_average_strategy() - # print("AI strategy", strategy) - # print("AI action", action) +import joblib +from abstraction import calculate_equity, predict_cluster_fast +from postflop_holdem import HoldemInfoSet, HoldEmHistory - if action == "k": # check - if observed_env.game_stage == 2: - self.current_bet = 2 - else: - self.current_bet = 0 +import copy 
- self.engine.say("I Check") - elif action == "c": - if observed_env.get_highest_current_bet() == self.player_balance: - self.engine.say("I call your all-in. You think I'm afraid?") + +class CFRAIPlayer(AIPlayer): + def __init__(self, balance) -> None: + super().__init__(balance) + + self.infosets = joblib.load("../src/infoSets_batch_7.joblib") + + def perform_postflop_abstraction(self, observed_env): + history = copy.deepcopy(observed_env.history) + + pot_total = observed_env.BIG_BLIND * 2 + # Compute preflop pot size + flop_start = history.index("/") + for i, action in enumerate(history[:flop_start]): + if action[0] == "b": + bet_size = int(action[1:]) + pot_total = 2 * bet_size + + # Remove preflop actions + abstracted_history = history[:2] + + # Bet Abstraction (card abstraction is done later) + stage_start = flop_start + stage = self.get_stage(history[stage_start + 1 :]) + latest_bet = 0 + while True: + abstracted_history += ["/"] + + if ( + len(stage) >= 4 and stage[3] != "c" + ): # length 4 that isn't a call, we need to condense down + abstracted_history += [stage[0]] + + if stage[-1] == "c": + if len(stage) % 2 == 1: # ended on dealer + abstracted_history += ["bMAX", "c"] + else: + if stage[0] == "k": + abstracted_history += ["k", "bMAX", "c"] + else: + abstracted_history += ["bMIN", "bMAX", "c"] else: - self.engine.say(random.choice(self.get_trash_talk("c"))) - # If you call on the preflop - self.current_bet = observed_env.get_highest_current_bet() - elif action == "f": - self.engine.say(random.choice(self.get_trash_talk("f"))) + for i, action in enumerate(stage): + if action[0] == "b": + bet_size = int(action[1:]) + latest_bet = bet_size + pot_total += bet_size + + # this is a raise on a small bet + if abstracted_history[-1] == "bMIN": + abstracted_history += ["bMAX"] + # this is a raise on a big bet + elif abstracted_history[-1] == "bMAX": + abstracted_history[-1] = "k" # turn into a check + else: # first bet + if bet_size >= pot_total: + abstracted_history += ["bMAX"] + else: + abstracted_history += ["bMIN"] + + elif action == "c": + pot_total += latest_bet + abstracted_history += ["c"] + else: + abstracted_history += [action] + + # Proceed to next stage or exit if final stage + if "/" not in history[stage_start + 1 :]: + break + stage_start = history[stage_start + 1 :].index("/") + (stage_start + 1) + stage = self.get_stage(history[stage_start + 1 :]) + + return abstracted_history + + def get_stage(self, history): + if "/" in history: + return history[: history.index("/")] else: - self.current_bet = int(action[1:]) - if self.current_bet == self.player_balance: - self.engine.say(random.choice(self.get_trash_talk("all_in"))) + return history + + def place_bet(self, observed_env): + if observed_env.game_stage == 2: # preflop + if "k" in observed_env.valid_actions(): + action = "k" else: - self.engine.say(random.choice(self.get_trash_talk("b", self.current_bet))) + action = "c" + else: + abstracted_history = self.perform_postflop_abstraction(observed_env) + print("abstracted history", abstracted_history) + infoset_key = HoldEmHistory(abstracted_history).get_infoSet_key_online() + strategy = self.infosets[infoset_key].get_average_strategy() + print(infoset_key) + print("AI strategy ", strategy) + action = getAction(strategy) + if action == "bMIN": + action = "b" + str( + max(observed_env.BIG_BLIND, int(1 / 3 * observed_env.total_pot_balance)) + ) + elif action == "bMAX": + action = "b" + str(min(observed_env.total_pot_balance, self.player_balance)) - self.engine.runAndWait() + 
self.process_action(action, observed_env)
         return action
 
diff --git a/src/base.py b/src/base.py
index 2fc5ba9..e8ffe1a 100644
--- a/src/base.py
+++ b/src/base.py
@@ -161,11 +161,10 @@ def __init__(
         create_history,
         n_players: int = 2,
         iterations: int = 1000000,
-        tracker_interval=1000,
     ):
         self.n_players = n_players
         self.iterations = iterations
-        self.tracker_interval = tracker_interval
+        self.tracker_interval = int(iterations / 10)
         self.infoSets: Dict[str, InfoSet] = {}
         self.create_infoSet = create_infoSet
         self.create_history = create_history
@@ -193,7 +192,7 @@ def vanilla_cfr(
         if history.is_terminal():
             if debug:
                 print(f"history: {history.history} utility: {history.terminal_utility(i)}")
-                time.sleep(1)
+                time.sleep(0.1)
             return history.terminal_utility(i)
         elif history.is_chance():
             a = (
@@ -206,9 +205,6 @@ def vanilla_cfr(
         infoSet = self.get_infoSet(history)
         assert infoSet.player() == history.player()
 
-        if debug:
-            print("infoset", infoSet.to_dict())
-
         v = 0
         va = {}
 
@@ -233,19 +229,30 @@ def vanilla_cfr(
 
         # Update regret matching values
         infoSet.get_strategy()
 
+        if debug:
+            print("infoset", infoSet.to_dict())
+            print("strategy", infoSet.strategy)
+
         return v
 
     def vanilla_cfr_speedup(self, history: History, t: int, pi_0: float, pi_1: float, debug=False):
         """
         We double the speed by updating both player values simultaneously, since this is a zero-sum game.
+        NOTE: This doesn't work super well, and I don't understand why. The trick for the speedup is to assume that whatever the opponent gains
+        is the opposite of what we gain (zero-sum game). However, we need to make sure we always return the correct utility.
+
         """
 
         # Return payoff for terminal states
+        # ['3d7c', '4cQd', '/', '7sKd9c', 'bMIN', 'f']
         if history.is_terminal():
             if debug:
-                print(history.history, history.terminal_utility(0))
-                time.sleep(1)
-            return history.terminal_utility(0)
+                print(
+                    f"utility returned: {history.terminal_utility((len(history.get_last_game_stage()) + 1) % 2)}, history: {history.history}"
+                )
+            return history.terminal_utility(
+                (len(history.get_last_game_stage()) + 1) % 2
+            )  # overfit solution for holdem
         elif history.is_chance():
             a = (
                 history.sample_chance_outcome()
@@ -257,9 +264,6 @@ def vanilla_cfr_speedup(self, history: History, t: int, pi_0: float, pi_1: float
         infoSet = self.get_infoSet(history)
         assert infoSet.player() == history.player()
 
-        if debug:
-            print("infoset", infoSet.to_dict())
-
         v = 0
         va = {}
 
@@ -285,6 +289,12 @@ def vanilla_cfr_speedup(self, history: History, t: int, pi_0: float, pi_1: float
         # Update regret matching values
         infoSet.get_strategy()
 
+        if debug:
+            print("infoset", infoSet.to_dict())
+            print("va", va)
+            print("strategy", infoSet.strategy)
+            time.sleep(0.1)
+
         return v
 
     def vanilla_cfr_manim(
@@ -356,11 +366,11 @@ def solve(self, method="vanilla_speedup", debug=False):
                 for player in range(self.n_players):
                     if player == 0:
                         util_0 += self.vanilla_cfr_manim(
-                            self.create_history(), player, t, 1, 1, histories
+                            self.create_history(t), player, t, 1, 1, histories
                         )
                     else:
                         util_1 += self.vanilla_cfr_manim(
-                            self.create_history(), player, t, 1, 1, histories
+                            self.create_history(t), player, t, 1, 1, histories
                         )
 
                 print(histories)
@@ -371,11 +381,11 @@ def solve(self, method="vanilla_speedup", debug=False):
                 ):  # This is the slower way, we can speed by updating both players
                     if player == 0:
                         util_0 += self.vanilla_cfr(
-                            self.create_history(), player, t, 1, 1, debug=debug
+                            self.create_history(t), player, t, 1, 1, debug=debug
                         )
                     else:
                         util_1 += self.vanilla_cfr(
-                            self.create_history(), player, t, 1, 1, debug=debug
+
self.create_history(t), player, t, 1, 1, debug=debug ) if (t + 1) % self.tracker_interval == 0: @@ -384,13 +394,14 @@ def solve(self, method="vanilla_speedup", debug=False): self.tracker(self.infoSets) self.tracker.pprint() - if t % 2500 == 0: + if t % 500000 == 0: self.export_infoSets(f"infoSets_{t}.joblib") + self.export_infoSets("infoSets_solved.joblib") if method == "manim": return histories - def export_infoSets(self, filename = "infoSets.joblib"): + def export_infoSets(self, filename="infoSets.joblib"): joblib.dump(self.infoSets, filename) def get_expected_value( @@ -525,4 +536,4 @@ def __call__(self, infoSets: Dict[str, InfoSet]): def pprint(self): infoSets = self.tracker_hist[-1] for infoSet in infoSets.values(): - print(infoSet.infoSet, infoSet.get_average_strategy()) + print(infoSet.infoSet, "Regret: ", infoSet.regret, "Average Strategy: ", infoSet.get_average_strategy()) diff --git a/src/environment.py b/src/environment.py index 6ec79b2..37b8952 100644 --- a/src/environment.py +++ b/src/environment.py @@ -2,7 +2,8 @@ from evaluator import * from typing import List from player import Player -from aiplayer import AIPlayer +from postflop_holdem import PostflopHoldemHistory, PostflopHoldemInfoSet +from aiplayer import CFRAIPlayer class PokerEnvironment: @@ -42,7 +43,7 @@ def __init__(self) -> None: self.SMALL_BLIND = 1 self.BIG_BLIND = 2 - self.INPUT_CARDS = True + self.INPUT_CARDS = False self.history = [] self.players_balance_history = [] # List of "n" list for "n" players @@ -54,7 +55,7 @@ def get_player(self, idx) -> Player: return self.players[idx] def add_AI_player(self): # Add a dumb AI - self.players.append(AIPlayer(self.new_player_balance)) + self.players.append(CFRAIPlayer(self.new_player_balance)) self.AI_player_idx = len(self.players) - 1 def get_winning_players(self) -> List: @@ -358,7 +359,7 @@ def end_round(self): if player.playing_current_round: player.trash_talk_win() else: - player.get_trash_lose() + player.trash_talk_lose() else: for player in self.players: @@ -366,6 +367,5 @@ def end_round(self): if player.playing_current_round: player.trash_talk_fold() - self.game_stage = 6 # mark end of round self.distribute_pot_to_winning_players() diff --git a/src/holdem_flop.py b/src/holdem_flop.py deleted file mode 100644 index 44544b7..0000000 --- a/src/holdem_flop.py +++ /dev/null @@ -1,470 +0,0 @@ -""" -Abstracted version of Holdem Poker, used for training. - -To make this computationally feasible to solve on my macbook, I start solving at the flop. - -Card Abstraction -- 10 clusters for flop -- 5 clusters for turn -- 5 clusters for river - -10 * 5 * 5 = 250 clusters - -Bet abstraction (ONLY allow these 11 sequences) -- k ("check") -- bMIN ("bet 1/3 pot, or big blind if pot is too") -- bMAX ("bet the pot size") -- c ("call") -- f ("fold") - -kk -kbMINf -kbMINc -kbPOTf -kbPOTc -bMINf -bMINc -bMINbMAXf # opponent raises on you -bMINbMAXc # opponent raises on you -bPOTf -bPOTc - -11^3 = 1331 possible sequences (3 stages: flop, turn, river) - -In total, we have 250 * 1331 = 332750 information sets. - -This keeps it manageable. Anything more is in orders of millions... 
-""" - -import base -import numpy as np -from base import Player, Action -from tqdm import tqdm -from typing import List -from abstraction import ( - predict_cluster_fast, -) -from fast_evaluator import phEvaluatorSetup, evaluate_cards -import time - -DISCRETE_ACTIONS = ["k", "bMIN", "bMAX", "c", "f"] - - -# ----- GLOBAL VARIABLES Load the pre-generated dataset ----- -def load_dataset(): - global boards, player_hands, opponent_hands - global player_flop_clusters, player_turn_clusters, player_river_clusters - global opp_preflop_clusters, opp_flop_clusters, opp_turn_clusters, opp_river_clusters - global winners - - # Load the pre-generated dataset - boards = np.load("dataset/boards.npy").tolist() - player_hands = np.load("dataset/player_hands.npy").tolist() - opponent_hands = np.load("dataset/opponent_hands.npy").tolist() - - # Load player clusters - player_flop_clusters = np.load("dataset/player_flop_clusters.npy").tolist() - player_turn_clusters = np.load("dataset/player_turn_clusters.npy").tolist() - player_river_clusters = np.load("dataset/player_river_clusters.npy").tolist() - - # Load opponent clusters - opp_flop_clusters = np.load("dataset/opp_flop_clusters.npy").tolist() - opp_turn_clusters = np.load("dataset/opp_turn_clusters.npy").tolist() - opp_river_clusters = np.load("dataset/opp_river_clusters.npy").tolist() - - winners = np.load("dataset/winners.npy") - - -class HoldEmHistory(base.History): - """ - Example of history: - First two actions are the cards dealt to the players. The rest of the actions are the actions taken by the players. - 1. ['AkTh', 'QdKd', '/', 'QhJdKs', 'bMIN', 'c', '/', 'Ah', 'k', 'k', ...] - - Infoset: - [4, 'bMIN', 'c', '10', 'k', 'k', ...] - - ---- ACTIONS ---- - - k = check - - bX = bet X amount (this includes raising) - - c = call - - f = fold (you cannot fold if the other player just checked) - - Every round starts the same way: - Small blind = 1 chip - Big blind = 2 chips - - Total chips = 100BB per player. - Minimum raise = X to match bet, and Y is the raise amount - If no raise before, then the minimum raise amount is 2x the bet amount (preflop would be 2x big blind). - Else it is whatever was previously raised. This is not the same as 2x the previous bet amount. Just the Y raise amount. - - Ex: The bet is 10$. I raise to 50$, so I raised by 40$ (Y = 40). The next player's minimum raise is not 100$, but rather to 90$, since (it's 50$ to match the bet, and 40$ to match the raise). - - Minimum bet = 1 chip (0.5BB) - - The API for the history is inspired from the Slumbot API, https://www.slumbot.com/ - - I want to avoid all the extra overhead, so taking inspiration from `environment.py` with the `PokerEnvironment` - """ - - def __init__(self, history: List[Action] = [], sample_id=0): - super().__init__(history) - self.sample_id = sample_id % len(player_hands) - self.stage_i = history.count("/") - - def is_terminal(self): - if len(self.history) == 0: - return False - folded = self.history[-1] == "f" - is_showdown = self.stage_i == 3 and self._game_stage_ended() # call # check,check - if folded or is_showdown: - return True - else: - return False - - def actions(self): - if self.is_chance(): # draw cards - return ( - [] - ) # This should return the entire deck with current cards removed, but I do this for speedup by loading an existing dataset - - elif not self.is_terminal(): - """ - To limit this game going to infinity, I only allow 11 betting seqeunces. - Else the branching factor huge. 
- - kk - kbMINf - kbMINc - kbMAXf - kbMAXc - bMINf - bMINc - bMINbMAXf - bMINbMAXc - bMAXf - bMAXc - - This is easy calculation. If someone raises, then treat that as bMAX. - - If we raise and the opponent raises, then we treat that as bMAX. So this way, we can always - treat the last action as bMAX. - - bMINbMAX = kBMAX - """ - assert ( - not self._game_stage_ended() - ) # game_stage_ended would mean that it is a chance node - - if self.history[-1] == "k": - return ["k", "bMIN", "bMAX"] - elif self.history[-1] == "bMIN": - return ["f", "c", "bMAX"] - elif self.history[-1] == "bMAX": - return ["f", "c"] - else: - return ["k", "bMIN", "bMAX", "f"] - - else: - raise Exception("Cannot call actions on a terminal history") - - def player(self): - """ - 1. ['AkTh', 'QdKd', '/', 'Qh', 'b2', 'c', '/', '2d', b2', 'f'] - """ - if len(self.history) <= 3: - return -1 - elif self._game_stage_ended(): - return -1 - elif self.history[-1] == "/": - return -1 - else: - return len(self.history) % 2 - - def _game_stage_ended(self): - return self.history[-1] == "c" or self.history[-1] == "f" or self.history[-2:] == ["k", "k"] - - def is_chance(self): - return super().is_chance() - - def sample_chance_outcome(self): - assert self.is_chance() - - if len(self.history) == 0: - return "".join(player_hands[self.sample_id]) - elif len(self.history) == 1: - return "".join(opponent_hands[self.sample_id]) - elif self.history[-1] != "/": - return "/" - elif self.stage_i == 1: - return "".join(boards[self.sample_id][:3]) - elif self.stage_i == 2: - return boards[self.sample_id][3] - elif self.stage_i == 3: - return boards[self.sample_id][4] - - def terminal_utility(self, i: Player) -> int: - assert self.is_terminal() # We can only call the utility for a terminal history - assert i in [0, 1] # Only works for 2 player games for now - - winner = winners[self.sample_id] - - pot_size = self._get_total_pot_size() - - last_game_stage_start_idx = max(loc for loc, val in enumerate(self.history) if val == "/") - last_game_stage = self.history[last_game_stage_start_idx + 1:] - - if self.history[-1] == "f": - if len(last_game_stage) % 2 == i: - return -pot_size - else: - return pot_size - - # showdown - if winner == 0: # tie - return pot_size / 2 - - if (winner == 1 and i == 0) or (winner == -1 and i == 1): - return pot_size - else: - return -pot_size - - def _get_total_pot_size(self): - total = 0 # starting balance is 4 - stage_total = 4 - for idx, action in enumerate(self.history): - if action == "/": - total += stage_total - stage_total = 0 - elif action == "bMIN": - stage_total += max(2, int(total / 3)) # bet 1/3 pot - elif action == "bMAX": - stage_total += total # bet the pot - elif action == "c": - if self.history[idx - 1] == "bMIN": - stage_total += max(2, int(total / 3)) - elif self.history[idx - 1] == "bMAX" and self.history[idx - 2] == "bMIN": - stage_total = 2 * total - elif self.history[idx - 1] == "bMAX": - stage_total += total - - stage_total = total - - total += stage_total - return total - - def __add__(self, action: Action): - new_history = HoldEmHistory(self.history + [action], self.sample_id) - return new_history - - def get_infoSet_key(self) -> List[Action]: - """ - This is where we abstract away cards and bet sizes. 
- """ - assert not self.is_chance() - assert not self.is_terminal() - - player = self.player() - infoset = [] - # ------- CARD ABSTRACTION ------- - # Assign cluster ID for FLOP/TURN/RIVER - stage_i = 0 - for i, action in enumerate(self.history): - if action not in DISCRETE_ACTIONS: - if action == "/": - stage_i += 1 - continue - if stage_i == 1: - if player == 0: - infoset.append(str(player_flop_clusters[self.sample_id])) - else: - infoset.append(str(opp_flop_clusters[self.sample_id])) - elif stage_i == 2: - assert len(action) == 2 - if player == 0: - infoset.append(str(player_turn_clusters[self.sample_id])) - else: - infoset.append(str(opp_turn_clusters[self.sample_id])) - elif stage_i == 3: - assert len(action) == 2 - if player == 0: - infoset.append(str(player_river_clusters[self.sample_id])) - else: - infoset.append(str(opp_river_clusters[self.sample_id])) - else: - infoset.append(action) - - return infoset - - -class HoldemInfoSet(base.InfoSet): - """ - Information Sets (InfoSets) cannot be chance histories, nor terminal histories. - This condition is checked when infosets are created. - - This infoset is an abstracted versions of the history in this case. - See the `get_infoSet_key(self)` function for these - - There are 2 abstractions we are doing: - 1. Card Abstraction (grouping together similar hands) - 2. Action Abstraction - - I've imported my abstractions from `abstraction.py`. - - """ - - def __init__(self, infoSet_key: List[Action], actions: List[Action], player: Player): - assert len(infoSet_key) >= 1 - super().__init__(infoSet_key, actions, player) - - -def create_infoSet(infoSet_key: List[Action], actions: List[Action], player: Player): - """ - We create an information set from a history. - """ - return HoldemInfoSet(infoSet_key, actions, player) - - -def create_history(sample_id): - return HoldEmHistory(sample_id=sample_id) - - -class HoldemCFR(base.CFR): - def __init__( - self, - create_infoSet, - create_history, - n_players: int = 2, - iterations: int = 1000000, - ): - super().__init__(create_infoSet, create_history, n_players, iterations) - - -from joblib import Parallel, delayed - - -def evaluate_winner(board, player_hand, opponent_hand): - p1_score = evaluate_cards(*(board + player_hand)) - p2_score = evaluate_cards(*(board + opponent_hand)) - if p1_score < p2_score: - return 1 - elif p1_score > p2_score: - return -1 - else: - return 0 - - -def generate_dataset(num_samples=250000, save=True): - """ - To make things faster, we pre-generate the boards and hands. 
We also pre-cluster the hands - """ - boards, player_hands, opponent_hands = phEvaluatorSetup(num_samples) - - np_boards = np.array(boards) - np_player_hands = np.array(player_hands) - np_opponent_hands = np.array(opponent_hands) - - player_flop_cards = np.concatenate((np_player_hands, np_boards[:, :3]), axis=1).tolist() - player_turn_cards = np.concatenate((np_player_hands, np_boards[:, :4]), axis=1).tolist() - player_river_cards = np.concatenate((np_player_hands, np_boards), axis=1).tolist() - opp_flop_cards = np.concatenate((np_opponent_hands, np_boards[:, :3]), axis=1).tolist() - opp_turn_cards = np.concatenate((np_opponent_hands, np_boards[:, :4]), axis=1).tolist() - opp_river_cards = np.concatenate((np_opponent_hands, np_boards), axis=1).tolist() - - curr = time.time() - print("generating clusters") - - player_flop_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=1000, total_clusters=10) - for cards in tqdm(player_flop_cards) - ) - player_turn_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=1000, total_clusters=5) - for cards in tqdm(player_turn_cards) - ) - player_river_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=1000, total_clusters=5) - for cards in tqdm(player_river_cards) - ) - - opp_flop_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=1000, total_clusters=10) - for cards in tqdm(opp_flop_cards) - ) - opp_turn_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=500, total_clusters=5) - for cards in tqdm(opp_turn_cards) - ) - opp_river_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=200, total_clusters=5) - for cards in tqdm(opp_river_cards) - ) - - winners = Parallel(n_jobs=-1)( - delayed(evaluate_winner)(board, player_hand, opponent_hand) - for board, player_hand, opponent_hand in tqdm(zip(boards, player_hands, opponent_hands)) - ) - - print("saving datasets") - np.save("dataset/boards.npy", boards) - np.save("dataset/player_hands.npy", player_hands) - np.save("dataset/opponent_hands.npy", opponent_hands) - np.save("dataset/winners.npy", winners) - print("continuing to save datasets") - - np.save("dataset/player_flop_clusters.npy", player_flop_clusters) - np.save("dataset/player_turn_clusters.npy", player_turn_clusters) - np.save("dataset/player_river_clusters.npy", player_river_clusters) - - np.save("dataset/opp_flop_clusters.npy", opp_flop_clusters) - np.save("dataset/opp_turn_clusters.npy", opp_turn_clusters) - np.save("dataset/opp_river_clusters.npy", opp_river_clusters) - - print(time.time() - curr) - - -if __name__ == "__main__": - generate_dataset() - load_dataset() - cfr = HoldemCFR(create_infoSet, create_history) - # cfr.infoSets = joblib.load("infoSets_2500.joblib") - # print("finished loading") - # cfr.solve(debug=True) - cfr.solve() - # cfr.solve_multiprocess( - # initializer=load_dataset, - # ) - -# """ -# When we work with these abstractions, we have two types: -# 1. Action Abstraction -# 2. Card Abstraction - -# Both of these are implemented in a different way. 
- -# """ - -# hist: HoldEmHistory = create_history() -# assert hist.player() == -1 -# hist1 = hist + "AkTh" -# assert hist1.player() == -1 -# hist2 = hist1 + "QdKd" -# assert hist2.player() == 0 -# print(hist2.get_infoSet_key(kmeans_flop, kmeans_turn, kmeans_river)) -# hist3 = hist2 + "b2" -# assert hist3.player() == 1 -# hist4 = hist3 + "c" -# assert hist4.player() == -1 -# # Below are chance events, so it doesn't matter which player it is -# hist5 = hist4 + "/" -# assert hist5.player() == -1 -# hist6 = hist5 + "QhKsKh" -# assert hist6.player() == 1 -# hist7 = hist6 + "b1" -# hist8: HoldEmHistory = hist7 + "b3" -# curr = time.time() -# print(hist8.get_infoSet_key(kmeans_flop, kmeans_turn, kmeans_river), time.time() - curr) - -# # cfr = base.CFR(create_infoSet, create_history) -# # cfr.solve() diff --git a/src/kuhn.py b/src/kuhn.py index cf80e27..689bc61 100644 --- a/src/kuhn.py +++ b/src/kuhn.py @@ -117,7 +117,7 @@ class KuhnInfoSet(base.InfoSet): """ def __init__(self, infoSet_key: List[Action], actions: List[Action], player: Player): - assert len(infoSet) >= 2 + assert len(infoSet_key) >= 2 super().__init__(infoSet_key, actions, player) @@ -128,12 +128,12 @@ def create_infoSet(infoSet_key: List[Action], actions: List[Action], player: Pla return KuhnInfoSet(infoSet_key, actions, player) -def create_history(): +def create_history(t): return KuhnHistory() if __name__ == "__main__": - cfr = base.CFR(create_infoSet, create_history) - cfr.solve() + cfr = base.CFR(create_infoSet, create_history, iterations=5000) + cfr.solve(debug=False, method="vanilla") # TODO: Add playing option, right now there is old code in research/kuhn, # which is not oop diff --git a/src/postflop_holdem.py b/src/postflop_holdem.py index 380e34d..829d90e 100644 --- a/src/postflop_holdem.py +++ b/src/postflop_holdem.py @@ -5,10 +5,10 @@ Card Abstraction - 10 clusters for flop -- 5 clusters for turn -- 5 clusters for river +- 10 clusters for turn +- 10 clusters for river -10 * 5 * 5 = 250 clusters +10^3 = 1000 clusters Bet abstraction (ONLY allow these 11 sequences) - k ("check") @@ -31,7 +31,8 @@ 11^3 = 1331 possible sequences (3 stages: flop, turn, river) -In total, we have 250 * 1331 = 332750 information sets. +In total, we have 1000 * 1331 = 1 331 000 information sets. +However, i noticed that only ~10% of the information sets are actually visited, since huge transitions are not possible. This keeps it manageable. Anything more is in orders of millions... """ @@ -42,7 +43,7 @@ from tqdm import tqdm from typing import List from abstraction import ( - predict_cluster_pre, + predict_cluster_fast, ) from fast_evaluator import phEvaluatorSetup, evaluate_cards import time @@ -75,7 +76,7 @@ def load_dataset(): winners = np.load("dataset/winners.npy") -class HoldEmHistory(base.History): +class PostflopHoldemHistory(base.History): """ Example of history: First two actions are the cards dealt to the players. The rest of the actions are the actions taken by the players. 
@@ -110,7 +111,7 @@ class HoldEmHistory(base.History):
 
     def __init__(self, history: List[Action] = [], sample_id=0):
         super().__init__(history)
-        self.sample_id = sample_id % len(player_hands)
+        self.sample_id = sample_id
         self.stage_i = history.count("/")
 
     def is_terminal(self):
@@ -159,8 +160,10 @@ def actions(self):
 
             if self.history[-1] == "k":
                 return ["k", "bMIN", "bMAX"]
-            elif self.history[-1] == "bMIN":
+            elif self.history[-2:] == ["k", "bMIN"]:
                 return ["f", "c"]
+            elif self.history[-1] == "bMIN":
+                return ["bMAX", "f", "c"]
             elif self.history[-1] == "bMAX":
                 return ["f", "c"]
             else:
@@ -237,8 +240,6 @@ def terminal_utility(self, i: Player) -> int:
         else:
             return -pot_size / 2
 
-
-
     def _get_total_pot_size(self, history):
         total = 0
         stage_total = 4  # assume preflop is a check + call, so 4 in pot (1 BB = 2 chips)
@@ -262,9 +263,50 @@ def _get_total_pot_size(self, history):
         return total
 
     def __add__(self, action: Action):
-        new_history = HoldEmHistory(self.history + [action], self.sample_id)
+        new_history = PostflopHoldemHistory(self.history + [action], self.sample_id)
         return new_history
 
+    def get_infoSet_key_online(self) -> str:
+        history = self.history
+        player = self.player()
+        infoset = []
+        # ------- CARD ABSTRACTION -------
+        # Assign cluster ID for FLOP/TURN/RIVER
+        stage_i = 0
+        hand = []
+        if player == 0:
+            hand = [history[0][:2], history[0][2:4]]
+        else:
+            hand = [history[1][:2], history[1][2:4]]
+        community_cards = []
+        for i, action in enumerate(history):
+            if action not in DISCRETE_ACTIONS:
+                if action == "/":
+                    stage_i += 1
+                    continue
+                if stage_i != 0:
+                    community_cards += [history[i][j : j + 2] for j in range(0, len(action), 2)]
+                print(hand + community_cards)
+                if stage_i == 1:
+                    assert len(action) == 6
+                    infoset.append(
+                        str(predict_cluster_fast(hand + community_cards, total_clusters=10))
+                    )
+                elif stage_i == 2:
+                    assert len(action) == 2
+                    infoset.append(
+                        str(predict_cluster_fast(hand + community_cards, total_clusters=5))
+                    )
+                elif stage_i == 3:
+                    assert len(action) == 2
+                    infoset.append(
+                        str(predict_cluster_fast(hand + community_cards, total_clusters=5))
+                    )
+            else:
+                infoset.append(action)
+
+        return "".join(infoset)
+
     def get_infoSet_key(self) -> List[Action]:
         """
         This is where we abstract away cards and bet sizes.
@@ -305,7 +347,7 @@ def get_infoSet_key(self) -> List[Action]:
         return infoset
 
 
-class HoldemInfoSet(base.InfoSet):
+class PostflopHoldemInfoSet(base.InfoSet):
     """
     Information Sets (InfoSets) cannot be chance histories, nor terminal histories.
     This condition is checked when infosets are created.
@@ -330,14 +372,14 @@ def create_infoSet(infoSet_key: List[Action], actions: List[Action], player: Pla
     """
     We create an information set from a history.
""" - return HoldemInfoSet(infoSet_key, actions, player) + return PostflopHoldemInfoSet(infoSet_key, actions, player) def create_history(sample_id): - return HoldEmHistory(sample_id=sample_id) + return PostflopHoldemHistory(sample_id=sample_id) -class PostFlopHoldemCFR(base.CFR): +class PostflopHoldemCFR(base.CFR): def __init__( self, create_infoSet, @@ -383,29 +425,24 @@ def generate_dataset(num_samples=50000, save=True): print("generating clusters") player_flop_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=1000, total_clusters=10) - for cards in tqdm(player_flop_cards) + delayed(predict_cluster_fast)(cards, total_clusters=10) for cards in tqdm(player_flop_cards) ) player_turn_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=1000, total_clusters=5) - for cards in tqdm(player_turn_cards) + delayed(predict_cluster_fast)(cards, total_clusters=10) for cards in tqdm(player_turn_cards) ) player_river_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=1000, total_clusters=5) + delayed(predict_cluster_fast)(cards, total_clusters=10) for cards in tqdm(player_river_cards) ) opp_flop_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=1000, total_clusters=10) - for cards in tqdm(opp_flop_cards) + delayed(predict_cluster_fast)(cards, total_clusters=10) for cards in tqdm(opp_flop_cards) ) opp_turn_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=500, total_clusters=5) - for cards in tqdm(opp_turn_cards) + delayed(predict_cluster_fast)(cards, total_clusters=10) for cards in tqdm(opp_turn_cards) ) opp_river_clusters = Parallel(n_jobs=-1)( - delayed(predict_cluster_fast)(cards, n=200, total_clusters=5) - for cards in tqdm(opp_river_cards) + delayed(predict_cluster_fast)(cards, total_clusters=10) for cards in tqdm(opp_river_cards) ) winners = Parallel(n_jobs=-1)( @@ -436,14 +473,11 @@ def generate_dataset(num_samples=50000, save=True): if __name__ == "__main__": # Train in batches of 50,000 hands ITERATIONS = 50000 - cfr = PostFlopHoldemCFR(create_infoSet, create_history, iterations=ITERATIONS) + cfr = PostflopHoldemCFR(create_infoSet, create_history, iterations=ITERATIONS) for i in range(20): - if i == 0: - load_dataset() - else: - generate_dataset(save=False, num_samples=ITERATIONS) - cfr.solve(debug=False, method="vanilla") - cfr.export_infoSets(f"infoSets_batch_{i}.joblib") + generate_dataset(save=False, num_samples=ITERATIONS) + cfr.solve(debug=False, method="vanilla_speedup") + cfr.export_infoSets(f"new_vanilla_speedup_infoSets_batch_{i}.joblib") # load_dataset() # cfr.infoSets = joblib.load("infoSets_2500.joblib") @@ -462,7 +496,7 @@ def generate_dataset(num_samples=50000, save=True): # """ -# hist: HoldEmHistory = create_history() +# hist: PostflopHoldemHistory = create_history() # assert hist.player() == -1 # hist1 = hist + "AkTh" # assert hist1.player() == -1 @@ -479,7 +513,7 @@ def generate_dataset(num_samples=50000, save=True): # hist6 = hist5 + "QhKsKh" # assert hist6.player() == 1 # hist7 = hist6 + "b1" -# hist8: HoldEmHistory = hist7 + "b3" +# hist8: PostflopHoldemHistory = hist7 + "b3" # curr = time.time() # print(hist8.get_infoSet_key(kmeans_flop, kmeans_turn, kmeans_river), time.time() - curr)