３目並べを利用したAlphaGoの学習 tensorflow-1編

1/06/2023

1.概要

AlphaGoの勉強過程で３目並べを学んでいます。前回まではプログラムロジックを作成していましたが、今回からは３目並べの全ての組合せデータを利用して、AI(DeepLearning)がプログラムロジックに勝てるかを試します。DeepLearningのフレームワークとしてtensorflowの1.xと2.xの両方を試してみます。内容が多いので複数回に分割して記載します。

2.詳細

(a) 概要

３目並べのフィールドを３☓３のイメージと考えて、手書き文字認識の手法を利用します。最初に利用する環境はtensorflow-1.15です。入力データはminimax法で活用したすべての手順(9!=362880)の組み合わせの中から勝負が決まった時点の３目並べのフィールド情報と結果（勝ち、負け、引き分け）を利用します。

tensorflowで利用できる形式に変換し、学習をしてモデルを作成し、モデルを利用して３目並べの対戦をします。元情報がminimax法で解析した情報なので、tensorflowによる学習結果がminimax法まで到達できると最高の結果です。大まかな手順は以下の通りです。

(1) minimax法の解析を利用して学習用入力データを作成

(2) 作成した学習用入力データを利用して、tensorflowでモデル作成

(3) tensorflowのモデルを利用して実際に対戦

上記手順を３回に分けて記述します。

(b) 詳細

(1) minimax法の解析を利用して学習用入力データを作成

dlmakedata.pyを作成します。この中で利用するtictactoe.pyはmontecarlo版を利用します。

tictactoeのコードは本ブログの「Tictactoe like the montecarlo」(2023/12/30)を参照してください。

学習用データは、dl1_data.npy(フィールドデータ)、dl2_data.npy(結果データ)です。

プログラムを実行した結果、学習用データの件数は、255,168件でした。

これは9!=362880よりも少なくなります。最低５手で勝負が決着する場合などがあるためです。

from tictactoe import Tictactoe

import numpy as np

def minimax_select(actions):
    """Collect every finished game reachable from the given opening moves.

    Each move in ``actions`` is played on the module-level ``obj`` board,
    ``minimax`` then walks all continuations, and the move is taken back.

    Args:
        actions: Opening moves to try, in order.

    Returns:
        A two-element list ``[boards, labels]`` with the strings gathered by
        ``minimax``: comma-joined boards and their matching result labels.
    """
    boards = []
    labels = []
    for opening in actions:
        # The value returned by the opening move is not inspected here.
        obj.do_game(opening)
        minimax(obj.next_action(), boards, labels)
        obj.undo_game(opening)
    return [boards, labels]

def minimax(actions, r1, r2):
    """Recursively play out all continuations and record every finished board.

    Args:
        actions: Moves available in the current position.
        r1: Output list; receives ``obj.fields`` joined with commas each time
            a game ends.
        r2: Output list; receives the matching label string:
            ``"1,0,0"`` for score 1, ``"0,1,0"`` for score -1 and
            ``"0,0,1"`` for score 0.

    Any other score returned by ``obj.do_game`` means the game goes on, so
    the search descends into ``obj.next_action()``.
    """
    finished = (1, -1, 0)
    encoded = ("1,0,0", "0,1,0", "0,0,1")
    for move in actions:
        outcome = obj.do_game(move)
        if outcome in finished:
            board_text = ",".join(map(str, obj.fields))
            label_text = encoded[finished.index(outcome)]
            r1.append(board_text)
            r2.append(label_text)
        else:
            minimax(obj.next_action(), r1, r2)
        # Always restore the position before trying the next sibling move.
        obj.undo_game(move)

def string_to_array(strlist):
    """Convert comma-separated integer strings into a float32 array.

    Args:
        strlist: Iterable of strings such as ``"1,0,-1"``; every field must
            be parseable by ``int``.

    Returns:
        A NumPy ``float32`` array with one row per input string.
    """
    rows = [[int(field) for field in line.split(",")] for line in strlist]
    # Build the integer array first, then cast, exactly as before.
    return np.array(rows).astype(np.float32)

if __name__ == "__main__":
    # ``obj`` must be a module-level name: minimax_select() and minimax()
    # read it as a global.
    obj = Tictactoe()
    actions = list(range(9))

    result = minimax_select(actions)
    board_rows, label_rows = result
    print(len(board_rows), len(label_rows))

    r1 = string_to_array(board_rows)
    r2 = string_to_array(label_rows)
    for converted in (r1, r2):
        print(converted, converted.dtype, converted.shape)

    # np.save appends the ".npy" suffix to these names.
    np.save('dl1_data', r1)
    np.save('dl2_data', r2)

    # Read both files back and print them as a round-trip check.
    reloaded = [np.load('dl1_data.npy'), np.load('dl2_data.npy')]
    for stored in reloaded:
        print(stored, stored.dtype, stored.shape)

(2) 作成した学習用入力データを利用して、tensorflowでモデル作成

dltensorflow.pyを作成します。動作環境はtensorflow-1.15の環境です。

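学習用データは、r1_data.npy(フィールドデータ)、r2_data.npy(結果データ)で、件数は、255,168件です。なお、(1)のdlmakedata.pyが保存するファイル名はdl1_data.npy、dl2_data.npyなので、ファイル名を変更するか、下記コードのnp.load()のファイル名を合わせてください。これを75%のトレーニングデータと25%のテストデータに分割して動作確認後、再度100%のデータを利用して、もう一度トレーニングをしてモデルを作成、保存します。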

下記コードのコメント部分が最初のトレーニング部分です。現在は100%でトレーニングの状態になっています。

完成したモデルは、dlmodel.h5で保存します。

import tensorflow as tf

from tensorflow.keras.layers import Activation, Dense, Dropout

from tensorflow.keras.models import Sequential, load_model

from tensorflow.keras.optimizers import SGD

import numpy as np

import matplotlib.pyplot as plt

# Load the board array and the result array.
# NOTE(review): the data-generation script shown earlier saves
# 'dl1_data.npy' / 'dl2_data.npy'; confirm these file names match the files
# actually on disk.
images = np.load('r1_data.npy')
labels = np.load('r2_data.npy')
print(images.shape, labels.shape)

# First 75% of the rows form the training split, the rest the test split.
# The rows are taken in file order; nothing is shuffled here.
count = int(images.shape[0] * 0.75)
train_images, test_images = images[:count], images[count:]
print(train_images.shape, test_images.shape)
train_labels, test_labels = labels[:count], labels[count:]
print(train_labels.shape, test_labels.shape)

# 9 board cells in, 3 class probabilities out.
model = Sequential([
    Dense(64, activation='sigmoid', input_shape=(9,)),
    Dense(32, activation='sigmoid'),
    Dropout(rate=0.5),
    Dense(3, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer=SGD(lr=0.1),
    metrics=['acc'],
)

# First experiment (kept for reference): fit on the 75% training split only.
#history = model.fit(train_images, train_labels, batch_size=500,
#                    epochs=50, validation_split=0.2)

# Current run: fit on the full data set.
history = model.fit(
    images,
    labels,
    batch_size=500,
    epochs=100,
    validation_split=0.2,
)

# Accuracy curves for the training and validation portions.
for metric in ('acc', 'val_acc'):
    plt.plot(history.history[metric], label=metric)
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(loc='best')
plt.show()

# Save the model, then reload it to check that the saved file is usable.
model.save('dlmodel.h5')
model = load_model('dlmodel.h5')

# NOTE(review): test_images is a slice of the same ``images`` array that was
# passed to fit() above, so this is not an independent hold-out score.
test_loss, test_acc = model.evaluate(test_images, test_labels)
print('loss: {:.3f}\nacc: {:.3f}'.format(test_loss, test_acc))

(3) tensorflowのモデルを利用して実際に対戦

ttttensorflow.pyを作成します。動作環境はtensorflow-1.15環境です。この中で利用するtictactoe.pyはmontecarlo版を利用します。

トレーニングしたモデル(dlmodel.h5)をロードします。最初、モデルの結果だけを利用したのですが、minimax法と同様にリーチ目を認識できません。そこで、alphabeta法で利用したis_reach()も利用しています。感触的にはminimax法と同等程度の手を打つようです。

from tictactoe import Tictactoe

import random

import tensorflow as tf

from tensorflow.keras.models import load_model

import numpy as np

def random_select(actions):
    """Return one uniformly chosen element of ``actions``.

    Args:
        actions: Non-empty sequence of candidate moves.

    Returns:
        The element at a random index.

    Raises:
        ValueError: If ``actions`` is empty.
    """
    # randrange(n) picks the same index randint(0, n - 1) would.
    return actions[random.randrange(len(actions))]

def input_select(actions):
    """Ask the user for a move until a legal one is entered.

    The list of legal moves is printed before every prompt. Input that is
    not an integer, or is an integer not contained in ``actions``, prints
    ``'input again'`` and prompts again.

    Args:
        actions: Collection of legal moves.

    Returns:
        The chosen move as an ``int`` that is a member of ``actions``.
    """
    while True:
        print(actions)
        try:
            action = int(input('select actions='))
        except ValueError:
            # A typo such as "a" used to abort the whole game with a
            # traceback; treat it like any other invalid choice.
            print('input again')
            continue
        if action in actions:
            return action
        print('input again')

_MODEL_CACHE = {}


def _get_model(path='dlmodel.h5'):
    """Return the Keras model stored at ``path``, loading it on first use only."""
    if path not in _MODEL_CACHE:
        _MODEL_CACHE[path] = load_model(path)
    return _MODEL_CACHE[path]


def tensorflow_select(actions):
    """Choose a move with the trained network, after a reach check.

    If ``obj.is_reach()`` reports a move, that move is returned immediately.
    Otherwise every candidate is played on the module-level ``obj`` board,
    the resulting ``obj.fields`` is fed to the model, the move is undone,
    and the candidate with the highest probability in the mover's column
    wins (the first one on ties).

    The column is 0 when an odd number of moves is available and 1 when the
    number is even. This matches the training labels, where ``"1,0,0"`` is
    score 1 and ``"0,1,0"`` is score -1.
    NOTE(review): assumes the side that moves with an odd number of free
    cells is the one scored 1 -- confirm against Tictactoe.

    Args:
        actions: Legal moves in the current position.

    Returns:
        The selected move, the reach move, or ``None`` when ``actions`` is
        empty.
    """
    if not actions:
        return None

    # One query is enough: every trial move below is undone again, so the
    # position is the same for each candidate.
    # NOTE(review): assumes is_reach() only reads the board state.
    reach = obj.is_reach()
    if reach is not None:
        print("reach action ", reach)
        return reach

    # Loaded once per process instead of once per move.
    model = _get_model()
    column = 0 if len(actions) % 2 == 1 else 1

    maxvalue = -1
    maxaction = None
    for action in actions:
        obj.do_game(action)
        board = np.array([obj.fields]).astype(np.float32)
        value = float(model.predict(board)[0][column])
        obj.undo_game(action)
        if value > maxvalue:
            maxvalue = value
            maxaction = action
    return maxaction

def montecarlo_select(actions):
    """Choose a move by counting outcomes of all continuations, random on ties.

    If ``obj.is_reach()`` reports a move, that move is returned immediately.
    Otherwise each candidate is played on the module-level ``obj`` board and
    ``minimax`` tallies ``[action, score 1 count, score -1 count, score 0
    count]`` over every continuation. The candidate with the largest count in
    the mover's slot is chosen; several equally good candidates are resolved
    with ``random``. Despite the name, the only randomness is that tie-break.

    The mover's slot is 1 when an odd number of moves is available and 2
    when the number is even.

    Args:
        actions: Legal moves in the current position.

    Returns:
        The selected move, the reach move, or ``None`` when ``actions`` is
        empty (this case used to raise ``ValueError`` from ``randint``).
    """
    if not actions:
        return None

    # One query is enough: every trial move below is undone again, so the
    # position is the same for each candidate.
    # NOTE(review): assumes is_reach() only reads the board state.
    reach = obj.is_reach()
    if reach is not None:
        print("reach action ", reach)
        return reach

    flg = 1 if len(actions) % 2 == 1 else 2

    result = []
    for action in actions:
        obj.do_game(action)
        init = [action, 0, 0, 0]
        minimax(obj.next_action(), init)
        result.append(init)
        obj.undo_game(action)
    print(result)

    maxvalue = max(item[flg] for item in result)
    maxlist = [item[0] for item in result if item[flg] == maxvalue]
    print('maxlist ', maxlist)

    if len(maxlist) == 1:
        maxaction = maxlist[0]
    else:
        maxaction = random.choice(maxlist)
    print('maxaction ', maxaction)
    return maxaction

def alphabeta_select(actions):
    """Choose a move by counting outcomes of all continuations, after a reach check.

    If ``obj.is_reach()`` reports a move, that move is returned immediately.
    Otherwise each candidate is played on the module-level ``obj`` board and
    ``minimax`` tallies ``[action, score 1 count, score -1 count, score 0
    count]`` over every continuation. The first candidate with the largest
    count in the mover's slot is returned.

    NOTE(review): no alpha-beta pruning is performed in this function; the
    search is the same exhaustive count used by ``minimax_select``.

    Args:
        actions: Legal moves in the current position.

    Returns:
        The selected move, the reach move, or ``None`` when ``actions`` is
        empty.
    """
    if actions:
        # One query is enough: every trial move below is undone again, so
        # the position is the same for each candidate.
        # NOTE(review): assumes is_reach() only reads the board state.
        reach = obj.is_reach()
        if reach is not None:
            print("reach action ", reach)
            return reach

    # Slot 1 when an odd number of moves is available, slot 2 otherwise.
    flg = 1 if len(actions) % 2 == 1 else 2

    result = []
    for action in actions:
        obj.do_game(action)
        init = [action, 0, 0, 0]
        minimax(obj.next_action(), init)
        result.append(init)
        obj.undo_game(action)
    print(result)

    maxvalue = -1
    maxaction = None
    for item in result:
        if item[flg] > maxvalue:
            maxvalue = item[flg]
            maxaction = item[0]
    return maxaction

def minimax_select(actions):
    """Choose the move whose continuations contain the most wins for the mover.

    Each candidate is played on the module-level ``obj`` board, ``minimax``
    fills a tally ``[action, score 1 count, score -1 count, score 0 count]``
    over every continuation, and the move is undone. The tallies are printed,
    then the first candidate with the largest count in the mover's slot is
    returned.

    Args:
        actions: Legal moves in the current position.

    Returns:
        The selected move, or ``None`` when ``actions`` is empty.
    """
    # Slot 1 when an odd number of moves is available, slot 2 otherwise.
    own_slot = 1 if len(actions) % 2 == 1 else 2

    tallies = []
    for move in actions:
        obj.do_game(move)
        tally = [move, 0, 0, 0]
        minimax(obj.next_action(), tally)
        tallies.append(tally)
        obj.undo_game(move)
    print(tallies)

    # Counts are never negative, so max() picks the same entry as a
    # "strictly greater than -1" scan; max() also keeps the first maximum.
    winner = max(tallies, key=lambda tally: tally[own_slot], default=None)
    return None if winner is None else winner[0]

def minimax(actions, result):
    """Recursively play out all continuations and count how the games end.

    Args:
        actions: Moves available in the current position.
        result: Four-element list updated in place; index 1 counts games
            ending with score 1, index 2 with score -1, index 3 with
            score 0. Index 0 is left untouched.

    Any other score returned by ``obj.do_game`` means the game goes on, so
    the search descends into ``obj.next_action()``.
    """
    finished = (1, -1, 0)
    for move in actions:
        outcome = obj.do_game(move)
        if outcome in finished:
            # Position in ``finished`` plus one is the tally slot.
            result[finished.index(outcome) + 1] += 1
        else:
            minimax(obj.next_action(), result)
        # Always restore the position before trying the next sibling move.
        obj.undo_game(move)

if __name__ == "__main__":
    # ``obj`` must be a module-level name: the *_select() functions and
    # minimax() read it as a global.
    obj = Tictactoe()
    actions = list(range(9))

    # Final score -> message printed when the game ends.
    endings = ((1, "o Win"), (-1, "x Win"), (0, "Draw"))

    # A game has at most nine moves.
    for _ in range(9):
        # Comparison kept as written; the type of ``myturn`` is not visible here.
        if obj.myturn == True:  # noqa: E712
            print('my turn')
            action = tensorflow_select(actions)
        else:
            print('other turn')
            action = random_select(actions)

        print(actions)
        print("select", action)
        result = obj.do_game(action)
        print(obj.game_state())

        verdict = next((text for score, text in endings if result == score), None)
        if verdict is not None:
            print(verdict)
            break

        actions = obj.next_action()

3.所見

学習結果でも強い手を打ちますが、チョンボもします。この程度は計算で学習できるということです。

参考

[本ブログ内参照]

・テスト駆動開発を利用したfibonacciのコード作成
・３目並べを利用したAlphaGoの学習ロジック作成編
・３目並べを利用したAlphaGoの学習 tensorflow-2編

参考書籍

AlphaZero 深層学習・強化学習・探索人工知能プログラミング実践入門

布留川英一著

検索

Ubuntu User Blog