Implementing a loss function (MSVE) in Reinforcement learning
I am trying to build a temporal difference learning agent for Othello. While the rest of my implementation seems to run as intended I am wondering about the loss function used to train my network. In Sutton's book "Reinforcement Learning: An Introduction", the Mean Squared Value Error (MSVE) is presented as the standard loss function. It is basically a Mean Squared Error weighted by the on-policy distribution. (Sum over all states s ( onPolicyDistribution(s) * [V(s) - V'(s,w)]² ) )
My question is now: How do I obtain this on-policy distribution when my policy is an ε-greedy function of a learned value function? Is it even necessary, and what's the issue if I just use an MSELoss instead?
I'm implementing all of this in pytorch, so bonus points for an easy implementation there :)
See also questions close to this topic

trouble implementing Breakout DeepMind's model
I am trying to follow DeepMind's paper on Q-learning for the game Breakout, and so far the performance is not improving, i.e. it is not learning anything at all. Instead of experience replay, I am just running the game, saving some data and training, and then running the game again. I've put up comments to explain my implementation; any help is much appreciated. Also I may be missing some key points, please have a look.
I am sending 4 frames as input and a one-hot matrix of the key pressed multiplied with the reward for that key press. Also I am trying with BreakoutDeterministic-v0, as mentioned in the paper
import gym import tflearn import numpy as np import cv2 from collections import deque from tflearn.layers.estimator import regression from tflearn.layers.core import input_data, dropout, fully_connected from tflearn.layers.conv import conv_2d game = "BreakoutDeterministicv4" env = gym.make(game) env.reset() LR = 1e3 num_games = 10 # arbitrary number, not final num_frames = 500 possible_actions = env.action_space.n accepted_score = 2 MODEL_NAME = 'data/Model_{}' gamma = 0.9 epsilon = 0.7 generations = 30 # arbitrary number, not final height = 84 width = 84 # instead of using experience replay, i'm simply calling this function in generations to generate training data def play4data(gen): training_data = [] for i in range(num_games): score = 0 data = [] prev_observation = [] env.reset() done = False d = deque() while not done: # env.render() # if it's 0th generation, model hasn't been trained yet, so can't call predict funtion # or if i want to take a random action based on some fixed epsilon value # or if it's in later gens , but doesn't have 4 frames yet , to send to model if gen == 0 or len(prev_observation)==0 or np.random.rand() <= epsilon or len(d) < 4: theta = np.random.randn(possible_actions) else: theta = model.predict(np.array(d).reshape(1, 4, height, width))[0] # action is a single value, namely max from an output like [0.00147357 0.00367402 0.00365852 0.00317618] action = np.argmax(theta) # action = env.action_space.sample() # take an action and record the results observation, reward, done, info = env.step(action) # since observation is 210 x 160 pixel image, resizing to 84 x 84 observation = cv2.resize(observation, (height, width)) # converting image to grayscale observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY) # d is a queue of 4 frames that i pass as an input to the model d.append(observation) if len(d) > 4: d.popleft() # for gen 0 , since model hasn't been trained yet, Q_sa is set to zeros or random # or i dont yet have 4 frames to call 
predict if gen == 0 or len(d) < 4: Q_sa = np.zeros(possible_actions) else: Q_sa = model.predict(np.array(d).reshape(1, 4, height, width))[0] # this one is just total score after each game score += reward if not done: Q = reward + gamma*np.amax(Q_sa) else: Q = reward # instead of mask, i just used list comparison to multiply with Q values # theta is onehot after this, like [0. 0. 0. 0.00293484] theta = (theta == np.amax(theta)) * 1 * Q # only appending those actions, for which some reward was generated # otherwise dataset becomes mostly zeros and model is 99 % accurate by just predicting zeros if len(prev_observation) > 0 and len(d) == 4 np.sum(theta) > 0: data.append([d, theta]) prev_observation = observation if done: break print('gen {1} game {0}: '.format(i, gen) + str(score)) # only taking those games for which total score at the end of game was above accpetable score if score >= accepted_score: for d in data: training_data.append(d) env.reset() return training_data # exact model described in DeepMind paper, just added a layer to end for 18 to 4 def simple_model(width, height, num_frames, lr, output=9, model_name='intelAI.model'): network = input_data(shape=[None, num_frames, width, height], name='input') conv1 = conv_2d(network, 8, 32,strides=4, activation='relu', name='conv1') conv2 = conv_2d(conv1, 4, 64, strides=2, activation='relu', name='conv2') conv3 = conv_2d(conv2, 3, 64, strides=1, activation='relu', name='conv3') fc4 = fully_connected(conv3, 512, activation='relu') fc5 = fully_connected(fc4, 18, activation='relu') fc6 = fully_connected(fc5, output, activation='relu') network = regression(fc6, optimizer='adam', loss='mean_square', learning_rate=lr, name='targets') model = tflearn.DNN(network, max_checkpoints=0, tensorboard_verbose=0, tensorboard_dir='log') return model # defining/ declaring the model model = simple_model(width, height, 4, LR, possible_actions) # this function is responsible for training the model def train2play(training_data): X = 
np.array([i[0] for i in training_data]).reshape(1, 4, height, width) Y = [i[1] for i in training_data] # X is the queue of 4 frames model.fit({'input': X}, {'targets': Y}, n_epoch=5, snapshot_step=500, show_metric=True, run_id='openai_learning') # repeating the whole process in terms of generations # training again and again after playing for set number of games for gen in range(generations): training_data = play4data(gen) np.random.shuffle(training_data) train2play(training_data) model.save(MODEL_NAME.format(game))

Machine learning: specific strategy learned because of playing against specific agent?
First of all I found difficulties formulating my question, feedback is welcome.
I have to make a machine learning agent to play dots and boxes.
I'm just in the early stages but came up with the question: if I let my machine learning agent (with a specific implementation) play against a copy of itself to learn and improve it's gameplay, wouldn't it just make a strategy against that specific kind of gameplay?
Would it be more interesting if I let my agent play and learn against different forms of other agents in an arbitrary fashion?

SemiBatch learning for weight update? QValueFunctionLearning
I was able to load my matrices into memory after I modified my feature space into only 1/0 entries. But when I was doing the LSTD step (LSTD-Q), due to too-big matrices, I always get "interrupted by signal 9: SIGKILL". My matrices are: f' (1099x800,000, dtype=bool), f - gamma*fn (800,000x1099, dtype=float).
I assume that was because of memory/cache issues. And with my laptop this issue could never be solved. Because if I insists on using pure BatchLearning, my matrix would be like 1099 * 400,000,000 and 400,000,000 x 1099 big.
I could use iterative training, but it doesn't use the training set effectively. So is there any semi-batch learning method for the weight update?

How can I access the model’s attribution created during forward pass when using dataparallel?
For example, I have such a forward function():
def forward(self, x):
    """Encode ``x``, weight the encoding by a learned importance mask, decode.

    The intermediate mask tensors are stashed on ``self`` so they can be
    inspected after the pass (note: under DataParallel these attributes end
    up on the per-GPU replicas, not on the wrapped module).
    """
    encoded = self.encoder(x)
    self.imp_mask_sigmoid = self.impmap_sigmoid(encoded)
    self.imp_mask = self.impmap_expand(self.imp_mask_sigmoid)
    masked = encoded * self.imp_mask
    return self.decoder(masked)
When I using dataparallel, I always using
model.module
to access the wrapped original model. However, I found I can't access model.module.imp_mask_sigmoid
, it will complain about AttributeError: 'ContentWeightedCNN' object has no attribute 'imp_mask_sigmoid'
. How to deal with it? Thanks very much.

PyTorch forward pass using weights trained by Theano
I've trained a small size CNN binary classifier in Theano. To have a simpler code, I wanted to port the trained weights to PyTorch or numpy forward pass for predictions. The predictions by original Theano program are satisfying but the PyTorch forward pass predicted all the examples to one class.
Here is how I save trained weights in Theano using h5py:
layer0_w = layer0.W.get_value(borrow=True) layer0_b = layer0.b.get_value(borrow=True) layer1_w = layer1.W.get_value(borrow=True) layer1_b = layer1.b.get_value(borrow=True) layer2_w = layer2.W.get_value(borrow=True) layer2_b = layer2.b.get_value(borrow=True) sm_w = layer_softmax.W.get_value(borrow=True) sm_b = layer_softmax.b.get_value(borrow=True) h5_l0w = h5py.File('./model/layer0_w.h5', 'w') h5_l0w.create_dataset('layer0_w', data=layer0_w) h5_l0b = h5py.File('./model/layer0_b.h5', 'w') h5_l0b.create_dataset('layer0_b', data=layer0_b) h5_l1w = h5py.File('./model/layer1_w.h5', 'w') h5_l1w.create_dataset('layer1_w', data=layer1_w) h5_l1b = h5py.File('./model/layer1_b.h5', 'w') h5_l1b.create_dataset('layer1_b', data=layer1_b) h5_l2w = h5py.File('./model/layer2_w.h5', 'w') h5_l2w.create_dataset('layer2_w', data=layer2_w) h5_l2b = h5py.File('./model/layer2_b.h5', 'w') h5_l2b.create_dataset('layer2_b', data=layer2_b) h5_smw = h5py.File('./model/softmax_w.h5', 'w') h5_smw.create_dataset('softmax_w', data=sm_w) h5_smb = h5py.File('./model/softmax_b.h5', 'w') h5_smb.create_dataset('softmax_b', data=sm_b)
Then load the weights to build a forward pass using Pytorch and Numpy:
import torch
import numpy as np
import torch.nn.functional as F


def model(data):
    """Forward pass of the ported CNN.

    Reads the module-level weight arrays (layer0_w/b, layer1_w/b,
    layer2_w/b, softmax_w/b) loaded from the Theano h5 dumps.

    NOTE(review): Theano's conv2d performs true convolution (kernels are
    flipped) whereas F.conv2d is cross-correlation; weights ported from
    Theano usually need np.flip over the two spatial axes first. This is
    a likely cause of the all-one-class predictions — TODO confirm.
    """
    conv0_out = F.conv2d(input=np2var(data),
                         weight=np2var(layer0_w),
                         bias=np2var(layer0_b))
    layer0_out = relu(var2np(conv0_out))
    conv1_out = F.conv2d(input=np2var(layer0_out),
                         weight=np2var(layer1_w),
                         bias=np2var(layer1_b))
    # Max over axis 2 acts as the pooling step before the dense layers.
    layer1_out = np.max(relu(var2np(conv1_out)), axis=2)
    dense_out = relu(np.matmul(layer1_out, layer2_w) + layer2_b)
    softmax_out = softmax(np.matmul(dense_out, softmax_w) + softmax_b)
    return softmax_out


def relu(x):
    # Elementwise max(x, 0) without torch.
    return x * (x > 0)


def np2var(x):
    # numpy -> torch; Variable is a no-op wrapper on modern PyTorch.
    return torch.autograd.Variable(torch.from_numpy(x))


def var2np(x):
    return x.data.numpy()


def softmax(x):
    # Numerically stable softmax. The '-' in "x - np.max(x)" was lost in
    # the original transcription; without it this is a syntax error.
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()
The input and kernel shapes for conv2d functions are the same for Theano and PyTorch, and network structures in two frameworks are the same. I couldn't detect any errors step by step. What could go wrong here?

How can I update the parameters of a neural network in PyTorch?
Let's say I wanted to multiply all parameters of a neural network in PyTorch (a class inheriting from
torch.nn.Module
) by 0.9
. How would I do that? 
ValueError: None values not supported.  Keras Custom Loss Function
I'm getting this error on my custom loss function: ValueError: None values not supported.
The output is a 2d array of binary digits (2000,2). I need to calculate the accuracy on certain entries only (selected by the NN), not the entire set.
Here's my loss function and how i did it:
def getBalance(x_true, x_pred): ##Selected entries for calculation x_pred = K.tf.round(x_pred) trade_index = K.tf.not_equal(x_pred[:,1], 0 ) x_true_tradeable = K.tf.boolean_mask(x_true[:,0], trade_index) x_pred_tradeable = K.tf.boolean_mask(x_pred[:,0], trade_index) Accuracy = K.mean(K.equal(x_true_tradeable, x_pred_tradeable)) return Accuracy
Here is my model for reference:
model = Sequential() model.add(Dropout(0.4, input_shape=(train_input_data_NN.shape[1], train_input_data_NN.shape[2]))) model.add(LSTM(30, dropout=0.4, recurrent_dropout=0.4, return_sequences=False)) model.add(Dense(2, activation='sigmoid')) model.compile(loss='getBalance', optimizer='adam') history = model.fit(train_input_data_NN, outputs_NN, epochs=50, batch_size=64, verbose=1, validation_data=(test_input_data_NN, outputs_NN_test))

How does keras handle multiple losses?
So my question is, if I have something like:
model = Model(inputs = input, outputs = [y1,y2]) l1 = 0.5 l2 = 0.3 model.compile(loss = [loss1,loss2], loss_weights = [l1,l2], ...)
What does keras do with the losses to obtain the final loss? Is it something like:
final_loss = l1*loss1 + l2*loss2
Also, what does it mean during training? Is the loss2 only used to update the weights on layers where y2 comes from? Or is it used for all the model's layers?
I'm pretty confused

How to implement custom Keras regulizer with output of another model as a parameter?
I'm trying to replicate this article: https://arxiv.org/pdf/1705.08302.pdf
Basically, a fully convolutional network (FCN) does voxel level predictions on a patch of a image, then, this patch and its respective labels are passed through an autoencoder and then compared to evaluate the "global shape" of the predictions.
So the loss function (eq. (1) page 4) is a linear combination between the cross entropy from the FCN and euclidean distance loss from the autoencoder.
Problem:
I have a working FCN and a working autoencoder, my problem has been implementing this loss function in Keras/tensorflow. So, how can I do that?
This is what I tried so far (without third term of equation) but gives wrong results:
def euclidean_distance_loss(y_true, y_pred): from keras import backend as K return K.sqrt(K.sum(K.square(y_pred  y_true))) def ACNN_loss(l1, autoencoder): from keras import backend as K def loss(y_true, y_pred): ae_seg = autoencoder(y_pred) ae_gt = autoencoder(y_true) Lhe = K.sqrt(K.sum(K.square(ae_seg  ae_gt))) Lx = K.binary_crossentropy(y_true, y_pred) return Lx + (l1 * Lhe) return loss l1 = 0.01 ae_path = #path of my autoencoder model and its weights autoencoder = keras.models.load_model(os.path.join(ae_path,'model.h5'), custom_objects={'euclidean_distance_loss': euclidean_distance_loss}) autoencoder.load_weights(os.path.join(ae_path,'weigths.h5')) model.compile(loss = ACNN_loss(l1, autoencoder), optimizer = keras.optimizers.Adam(lr=0.0003, beta_1=0.9, beta_2=0.999, epsilon=1e08, decay=0.0), metrics= ['accuracy', keras.metrics.binary_crossentropy] )
This is my first question so sorry if I messed up on any requirements. Thanks in advance

reinforcement learning (policy iteration, value iteration, SARSA, Qlearning)
I am new to reinforcement learning. I have gone through the theories of value iteration and policy iteration of dynamic programming, Monte Carlo planning, temporal difference, etc. Everywhere, I find either theories or Python examples which are not satisfactory for a beginner. I just need to understand a simple example showing the step-by-step iterations. Could anyone please show me the 1st and 2nd iterations for the example image that I have uploaded, for value iteration, policy iteration, generalized policy iteration, SARSA, and Q-learning?
https://i.imgur.com/0NIT3yN.png "gridword problem"

Reinforcement Learning: Q and Q(λ) speed difference on Windy Grid World environment
Preface:
I have attempted to solve this WindyGridWorld env. Having implemented both Q and Q(λ) algorithm, the results are pretty much the same (I am looking at steps per episode).
Problem:
From what I have read, I believe that a higher lambda parameter should update more states further back leading up to it; therefore, the amount of steps should decrease much more dramatically than regular Qlearning. This image shows what I am talking about.
Is this normal for this environment or have I implemented it wrong?
Code:
import matplotlib.pyplot as plt
import numpy as np
from lib.envs.windy_gridworld import WindyGridworldEnv
from collections import defaultdict

env = WindyGridworldEnv()


def epsilon_greedy_policy(Q, state, nA, epsilon):
    '''
    Create a policy in which epsilon dictates how likely it will
    take a random action.

    :param Q: links state -> action value (dictionary)
    :param state: state character is in (int)
    :param nA: number of actions (int)
    :param epsilon: chance it will take a random move (float)
    :return: probability of each action to be taken (list)
    '''
    probs = np.ones(nA) * epsilon / nA
    best_action = np.argmax(Q[state])
    probs[best_action] += 1.0 - epsilon
    return probs


def Q_learning_lambda(episodes, learning_rate, discount, epsilon, _lambda):
    '''
    Learns to solve the environment using Watkins's Q(λ).

    :param episodes: Number of episodes to run (int)
    :param learning_rate: How fast it will converge to a point (float [0, 1])
    :param discount: How much future events lose their value (float [0, 1])
    :param epsilon: chance a random move is selected (float [0, 1])
    :param _lambda: How much credit to give states leading up to reward (float [0, 1])
    :return: x, y points to graph (episode index, steps per episode)
    '''
    # Action-value table and eligibility trace, both lazily initialised.
    Q = defaultdict(lambda: np.zeros(env.action_space.n))
    e = defaultdict(lambda: np.zeros(env.action_space.n))
    # Points to plot: episode numbers vs. steps taken.
    x = np.arange(episodes)
    y = np.zeros(episodes)
    for episode in range(episodes):
        state = env.reset()
        probs = epsilon_greedy_policy(Q, state, env.action_space.n, epsilon)
        action = np.random.choice(len(probs), p=probs)
        for step in range(10000):
            next_state, reward, done, _ = env.step(action)
            probs = epsilon_greedy_policy(Q, next_state, env.action_space.n, epsilon)
            next_action = np.random.choice(len(probs), p=probs)
            # Q-learning (off-policy) target uses the greedy next action.
            best_next_action = np.argmax(Q[next_state])
            td_target = reward + discount * Q[next_state][best_next_action]
            td_error = td_target - Q[state][action]
            e[state][action] += 1
            # Propagate the TD error to every visited state via the trace.
            for s in Q:
                for a in range(len(Q[s])):
                    Q[s][a] += learning_rate * td_error * e[s][a]
                    # BUG FIX: the original compared with `is` (object
                    # identity). numpy integers are distinct objects, so the
                    # decay branch essentially never ran and the trace was
                    # reset every step — Q(λ) degenerated to plain Q-learning,
                    # which explains the identical learning curves.
                    if next_action == best_next_action:
                        # Watkins's Q(λ): decay trace while acting greedily...
                        e[s][a] = discount * _lambda * e[s][a]
                    else:
                        # ...and cut it after an exploratory action.
                        e[s][a] = 0
            if done:
                y[episode] = step
                e.clear()
                break
            # Update action and state
            action = next_action
            state = next_state
    return x, y
You can check out my Jupyter Notebook here if you would like to see the whole thing.

Replicating n-tuple strategy for playing 2048 with significantly inferior results — any guidance? Full code inside
I have created an agent for playing 2048 using the n-tuple approach (mimicked from the paper Temporal Difference Learning of N-Tuple Networks for the Game 2048). I have replicated their approach to the best of my knowledge but the results are extremely underwhelming. My agent has trained over 130,000 games but is still only hitting the 1024 tile about 7% of the time. I know training takes time but according to the paper their agent won the game >80% of the time using the strategy I am attempting to replicate after only 100,000 games. I am using the approach with the 2 3-tuples and 2 2-tuples with symmetric sampling as shown in the paper in Figure 10. Does anyone have any guidance as to what I may be doing wrong? Any and all help is appreciated.
Python Code:
# TD-Afterstate n-tuple learner for 2048.
# NOTE(review): the transcription of this file lost every '-' character;
# they have been restored below (randint bound, sentinel -1, [:-1] slices,
# -math.inf, the TD-error minus, time deltas). TODO confirm against the
# original source.
from random import randint, random
import numpy as np
from copy import copy
import sys
import time
from datetime import timedelta
from openpyxl import Workbook, load_workbook
import pickle
import math
import os.path


class Game:
    """A 4x4 2048 board with swipe/merge mechanics and score tracking."""

    def __init__(self, board):
        self.height = 4
        self.width = 4
        self.actions = [0, 1, 2, 3]
        self.total_score = 0
        if board is not None:
            self.board = board

    def empty_tiles(self):
        # Identifies the coordinates of empty tiles
        return (self.board == 0).nonzero()

    def add_tile(self):
        # Inserts a new tile (2 with p=0.9, 4 with p=0.1) into an empty space
        zero_coords = self.empty_tiles()
        if len(zero_coords[0]) > 0:
            new_tile = 2 if random() > .1 else 4
            rand_idx = randint(0, len(zero_coords[0]) - 1)
            self.board[zero_coords[0][rand_idx], zero_coords[1][rand_idx]] = new_tile

    def starting_board(self):
        # Fresh board with two random starting tiles.
        self.board = np.zeros((4, 4), dtype=np.int32)
        self.add_tile()
        self.add_tile()

    def print_board(self, nl=True):
        # Displays the board. NOTE(review): the exact border/spacing strings
        # were mangled in transcription; reconstructed as 5-char-wide cells
        # to match the "%4d " tile format — display only, no game effect.
        print("+----" * self.width + "+")
        for i in range(self.height):
            for j in range(self.width):
                if self.board[i][j] == 0:
                    print("     ", end='')
                else:
                    print("%4d " % (self.board[i][j]), end='')
            print("")
        print("+----" * self.width + "+")
        if nl:
            print("")
        sys.stdout.flush()

    def swipe(self, direction):
        # The swipe method is designed to swipe left — other directions are
        # handled by rotating the board first (rot90 returns a view, so the
        # in-place row writes update self.board).
        rotated_board = np.rot90(self.board, direction)
        # Points earned by merges during this swipe.
        new_points = 0
        for row in rotated_board:
            # tile1 and tile2 are adjacent (nonzero) tiles
            tile1 = 0
            idx = 0  # write index
            # Nonzero values with a -1 sentinel marking the end of the row.
            nz = np.append(row[row != 0], -1)
            for tile2 in nz:
                if tile2 == -1:
                    # End of row: flush any pending tile.
                    if tile1 != 0:
                        row[idx] = tile1
                        idx += 1
                elif tile1 == 0:
                    # Nothing pending yet — hold this tile for comparison.
                    tile1 = tile2
                elif tile1 == tile2:
                    # Identical tiles merge and score.
                    combined_tile = tile1 + tile2
                    row[idx] = combined_tile
                    new_points += combined_tile
                    self.total_score += combined_tile
                    idx += 1
                    tile1 = 0
                else:
                    # Different tiles: commit the pending one, hold the new one.
                    row[idx] = tile1
                    tile1 = tile2
                    idx += 1
            # Clear the remainder of the row.
            while idx < 4:
                row[idx] = 0
                idx += 1
        return new_points

    def done(self):
        """Determines if the board admits no further moves."""
        if (self.board == 0).any():
            return False
        if (self.board[1:] == self.board[:-1]).any():
            return False
        if (self.board[:, 1:] == self.board[:, :-1]).any():
            return False
        return True


def tups(board):
    # Returns the exponent of the tiles for each tuple, over all 8
    # symmetries of the board (4 rotations x {identity, transpose}).
    all_tups = []
    exp_board = board.copy()
    nz = exp_board.nonzero()
    exp_board[nz] = np.log2(exp_board[nz])
    boards = []
    t_board = np.transpose(exp_board)
    for i in range(4):
        boards.append(np.rot90(exp_board, i))
        boards.append(np.rot90(t_board, i))
    for board in boards:
        tup1 = tuple(board[0:3, 0:2].flatten())
        tup2 = tuple(board[0:3, 1:3].flatten())
        tup3 = tuple(board[:, 2])
        tup4 = tuple(board[:, 3])
        all_tups.append((tup1, tup2, tup3, tup4))
    return all_tups


def f(state, V1, V2, V3, V4):
    # f approximates the value function V by summing the LUT entries of
    # every symmetric sample of every tuple.
    all_tups = tups(state)
    score = 0
    for tup_set in all_tups:
        score = score + V1[tup_set[0]]
        score = score + V2[tup_set[1]]
        score = score + V3[tup_set[2]]
        score = score + V4[tup_set[3]]
    return score


def choose_action(state, V1, V2, V3, V4):
    # Greedy afterstate policy: pick the action maximising reward + V(afterstate);
    # actions that do not change the board are excluded via -inf.
    Vs = []
    for action in range(4):
        state_copy = copy(state)
        swiped_game = Game(state_copy)
        reward = swiped_game.swipe(action)
        n_state = copy(swiped_game.board)
        if (n_state != state).any():
            score = reward + f(n_state, V1, V2, V3, V4)
            Vs.append(score)
        else:
            Vs.append(-math.inf)
    action_order = list(reversed(np.argsort(Vs)))
    action = action_order[0]
    return action


def learn_evaluation(V1, V2, V3, V4, n_state, nn_state, learning_rate):
    # TD(0) afterstate update: V(s') <- V(s') + lr * (r'' + V(s''') - V(s'))
    # NOTE(review): the V tables are dtype int16, so learning_rate*(delta)
    # is truncated to an integer on assignment — with lr=0.0025 most small
    # TD errors vanish entirely; a plausible cause of the underwhelming
    # results. TODO confirm against the paper's implementation.
    n_action = choose_action(nn_state, V1, V2, V3, V4)
    temp_game = Game(nn_state)
    n_reward = temp_game.swipe(n_action)
    nnn_state = copy(temp_game.board)
    f_nnn_state = f(nnn_state, V1, V2, V3, V4)
    f_n_state = f(n_state, V1, V2, V3, V4)
    all_tups = tups(n_state)
    for tup_set in all_tups:
        V1[tup_set[0]] = V1[tup_set[0]] + learning_rate * (n_reward + f_nnn_state - f_n_state)
        V2[tup_set[1]] = V2[tup_set[1]] + learning_rate * (n_reward + f_nnn_state - f_n_state)
        V3[tup_set[2]] = V3[tup_set[2]] + learning_rate * (n_reward + f_nnn_state - f_n_state)
        V4[tup_set[3]] = V4[tup_set[3]] + learning_rate * (n_reward + f_nnn_state - f_n_state)


def create_workbook(path):
    # One-off creation of the results spreadsheet.
    wb = Workbook()
    ws = wb.create_sheet(title="Model Output")
    wb.save(filename=path)


# paths
path_model_output = "C:...\\2048\\N_tuples_symm_reward.xlsx"
path_base_V1 = "C:...\\2048\\V_symm\\V1r\\V1r_"
path_base_V2 = "C:...\\2048\\V_symm\\V2r\\V2r_"
path_base_V3 = "C:...\\2048\\V_symm\\V3r\\V3r_"
path_base_V4 = "C:...\\2048\\V_symm\\V4r\\V4r_"
if not os.path.isfile(path_model_output):
    create_workbook(path_model_output)

# Learning
old_time = time.time()
V1 = np.zeros(shape=(11, 11, 11, 11, 11, 11), dtype=np.int16)
V2 = np.zeros(shape=(11, 11, 11, 11, 11, 11), dtype=np.int16)
V3 = np.zeros(shape=(11, 11, 11, 11), dtype=np.int16)
V4 = np.zeros(shape=(11, 11, 11, 11), dtype=np.int16)
n_2048 = 0
n_1024 = 0
n_512 = 0
n_256 = 0
num_games = 1000000
learning_enabled = True
learning_rate = 0.0025
wb = load_workbook(filename=path_model_output)
ws = wb.active
for game_num in range(1, num_games + 1):
    game = Game(None)
    game.starting_board()
    moves = 0
    score = 0
    while not game.done() and np.amax(game.board) < 2048:
        state = copy(game.board)
        # Choose action
        action = choose_action(state, V1, V2, V3, V4)
        # Find reward, n_state (afterstate), nn_state (after random tile)
        reward = game.swipe(action)
        n_state = copy(game.board)
        game.add_tile()
        nn_state = copy(game.board)
        # Update LUTs
        if learning_enabled:
            # symmetric_learning(V, all_tups, n_state, nn_state, learning_rate)
            learn_evaluation(V1, V2, V3, V4, n_state, nn_state, learning_rate)
        score = score + reward
        state = nn_state
    max_tile = np.amax(game.board)
    if max_tile == 256:
        n_256 = n_256 + 1
    elif max_tile == 512:
        n_512 = n_512 + 1
    elif max_tile == 1024:
        n_1024 = n_1024 + 1
    elif max_tile == 2048:
        n_2048 = n_2048 + 1
    if game_num % 100 == 0:
        t = round(time.time() - old_time, 2)
        old_time = time.time()
        print("Game Number: {}; 2048: {}; 1024: {}; 512: {}, 256: {}; Time Elapsed: {}".format(
            game_num, str(n_2048) + "%", str(n_1024) + "%", str(n_512) + "%",
            str(n_256) + "%", str(timedelta(seconds=t))))
        n_2048 = 0
        n_1024 = 0
        n_512 = 0
        n_256 = 0
    if game_num % 1000 == 0:
        path1 = path_base_V1 + str(game_num) + ".pickle"
        pickle.dump(V1, open(path1, "wb"))
        path2 = path_base_V2 + str(game_num) + ".pickle"
        pickle.dump(V2, open(path2, "wb"))
        path3 = path_base_V3 + str(game_num) + ".pickle"
        pickle.dump(V3, open(path3, "wb"))
        path4 = path_base_V4 + str(game_num) + ".pickle"
        pickle.dump(V4, open(path4, "wb"))
        # NOTE(review): every multiple of 1000 is a multiple of 100, so the
        # counters were just reset above — these results are always zero.
        # Preserved as in the original; likely an existing bug to revisit.
        results = (game_num, str(n_2048) + "%", str(n_1024) + "%",
                   str(n_512) + "%", str(n_256) + "%")
        ws.append(results)
        wb.save(filename=path_model_output)

Tensorflow loss is already low
I'm doing an AI with reinforcement learning and i'm getting weird results, the loss shows like this: Tensorflow loss: https://imgur.com/a/Twacm
And while it's training, after each game, it's playing against a random player and after a player with a weighted matrix, but it goes up and down: results: https://imgur.com/a/iGuu2
Basically I'm doing a reinforcement learning agent that learns to play Othello, using ε-greedy, experience replay and deep networks using Keras over TensorFlow. I tried different architectures with activations like sigmoid, relu and, in the images shown above, tanh. All of them have similar loss but the results are a bit different. In this example the agent is learning from 100k professional games. Here is the architecture, with default learning rate 0.005:
model.add(Dense(units=200,activation='tanh',input_shape=(64,))) model.add(Dense(units=150,activation='tanh')) model.add(Dense(units=100,activation='tanh')) model.add(Dense(units=64,activation='tanh')) optimizer = Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=1e08, decay=0.0) model.compile(loss=LOSS,optimizer=optimizer)
Original code: https://github.com/JordiMD92/thellia/tree/keras
So, why do I get these results? Now my input is 64 neurons (8*8 matrix), with 0 = empty square, 1 = black square and -1 = white square. Is it bad to use negative inputs?

Why does my implementation of the negamax alphabeta not work?
I am working on a C++ negamax implementation with alpha-beta pruning, but it seems to not be working properly, and I am stumped. It even returns different values from the starting position, which should have identical values for every move. The method is called by the following code:
score = alphaBeta(INFINITY, INFINITY, depth, testBoard, White);
. Here is my code:

// Negamax implementation of alpha-beta search for Othello.
// Returns a score from the perspective of `color` (the side to move).
// NOTE(review): the transcription destroyed '-' characters and the
// "&curren" HTML entity (now restored as &currentBoard). The key negamax
// fix is negating the recursive result and flipping both the window and
// the color: score = -alphaBeta(-beta, -alpha, ..., -color).
float alphaBeta(float alpha, float beta, int depthleft, BitBoards currentBoard, int color) {
    // Prepare copy of board at this ply.
    currentBoards[depthleft] = currentBoard;
    // If at leaf node, return heuristic value.
    // NOTE(review): for negamax to be correct, evaluation() must score the
    // position from the perspective of `color`, not always from Black/White.
    // TODO confirm evaluation()'s sign convention.
    if (depthleft == 0) {
        return evaluation(&currentBoard);
    }
    // Are there any moves that can be played?
    possibleMoves = FindMoves(color, &currentBoard);
    if (possibleMoves == 0x0ull) {
        // If there are no moves that can be played, pass for a turn.
        color = -color;
        possibleMoves = FindMoves(color, &currentBoard);
        // Can the opponent play any moves?
        if (possibleMoves == 0x0ull) {
            // Game over: return the outcome by disc count.
            // NOTE(review): these terminal scores must also be relative to
            // the side to move for negamax — verify against `color`.
            if (Hammingweight(currentBoard.BlackPieces) > Hammingweight(currentBoard.WhitePieces)) {
                return 100000;
            }
            if (Hammingweight(currentBoard.BlackPieces) < Hammingweight(currentBoard.WhitePieces)) {
                return -100000;
            } else {
                return 0;
            }
        }
    }
    // Find total possible moves and loop through them.
    moves = Hammingweight(possibleMoves);
    for (uint64b j = 0; j < moves; j++) {
        // Choose next move to test and clear it from the move mask.
        moveSpot = FindNextMove(possibleMoves);
        possibleMoves &= ~moveSpot;
        // Prepare scratch work board and make the move being considered.
        testBoard = currentBoards[depthleft];
        MakeMove(color, moveSpot, &testBoard);
        // Negamax recursion: child's score is negated, window mirrored,
        // side to move flipped.
        score = -alphaBeta(-beta, -alpha, depthleft - 1, testBoard, -color);
        if (score >= beta) {
            // Fail-hard beta cutoff.
            return beta;
        }
        if (score > alpha) {
            alpha = score;
        }
    }
    return alpha;
}

java reversi/othello AI, changed AI position can't be saved in array
the Gui is given, but can't be seen. Accoding to the order, myself can be KIPlayer or a person. This KIPlayer(AIPlayer) implements the Interface Player(). Inside of Player(),there are two methods: init() and nextMove().
my question is in the nextMove().
my logik of nextMove():
public Move nextMove(Move prevMove, long tOpponent, long t ): else{
save prevMove from rival in array
save changed position of prevMove in array // it works
save myself in array
save changed position of myself in array// here doesn't work, in the code it is: change(bestMove.x,bestMove.y,rival);
without the change(...) here, it works, but each time the changed position won't be saved, so this AIplayer will lose.if the change(...) code stays here, in the console, till this step, it doesn't work any move. how to fix it?Where should I put the change(for KIPlayer) in my code?} return bestMove;
package ki; import szte.mi.Player; import szte.mi.Move; import szte.mi.*; import java.util.ArrayList; import java.util.List; public class KIPlayer implements Player { public int[][] array; public int myself; public int rival; int sum=0; int bestX=1; int bestY=1; Move bestMove; public void init( int order, long t, java.util.Random rnd ) { this.array=new int [8][8]; array[3][3]=2; array[3][4]=1; array[4][3]=1; array[4][4]=2; if(order==0){ myself=1;//black rival=2; } else if(order==1){ myself=2;//white rival=1; } System.out.println("Init called"+"myself is"+myself); } public Move nextMove(Move prevMove, long tOpponent, long t ) { if(prevMove==null) { for(int i=0;i<8;i++) { for(int j=0;j<8;j++) { if(array[3][3]==2 && array[3][4]==1 && array[4][3]==1 && array[4][4]==2 && array[i][j]==0) { //the first move myself=1; rival=2; } else { switchPlayer(myself); } } } } else { // rival array[prevMove.x][prevMove.y]=this.rival; change(prevMove.x, prevMove.y, this.rival); // KI / AI bestMove= legalMove(rival,myself); change(bestMove.x,bestMove.y,rival);// probelm here!it doens't work //here is the question } return bestMove; //return legalMove(this.rival, this.myself); } public void switchPlayer(int myself) { if(myself==1) { myself=2; rival=1; } else if(myself==2) { myself=1; rival=2; } } public int getWeight(int x, int y) { int weight[][]= new int[][]{{90,60,10,10,10,10,60,90}, {60,80,5,5,5,5,80,60}, {10,5,1,1,1,1,5,10}, {10,5,1,1,1,1,5,10}, {10,5,1,1,1,1,5,10}, {10,5,1,1,1,1,5,10}, {60,80,5,5,5,5,80,60}, {90,60,10,10,10,10,60,90}}; return weight[x][y]; } public Move legalMove(int rival, int myself) { int Max= 500; for (int x=0;x<8;x++) { for(int y=0;y<8;y++) { if(this.array[x][y]==this.myself) {//position with same color of the player System.out.println("myself in legalMove: "+ x+"+"+y); int neighbour[][]= new int[3][3]; neighbour[1][1] = array[x][y]; // myself neighbour[1][0] = array[x][y1]; //up neighbour[1][2] = array[x][y+1]; //down neighbour[0][1] = array[x1][y]; //left 
neighbour[2][1] = array[x+1][y]; //right neighbour[0][0] = array[x1][y1];//left upper neighbour[2][0] = array[x+1][y1];//right upper neighbour[0][2] = array[x1][y+1];//left down neighbour[2][2] = array[x+1][y+1];//right down //} for(int j=0;j<3;j++) { for(int i=0; i<3; i++) { if(neighbour[i][j]!=20 && neighbour[i][j]!= myself ){ //3x3 int NeighborX= x+(i1); int NeighborY= y+(j1); System.out.println("3x3 X:"+NeighborX+",Y: "+NeighborY); if(array[NeighborX][NeighborY]==rival) { while(array[NeighborX][NeighborY] == rival) { //neighbor extending > edge NeighborX=NeighborX+(i1); NeighborY=NeighborY+(j1); System.out.println("neighborX"+NeighborX+"neighborY"+NeighborY +"array[X][Y]"+array[NeighborX][NeighborY]); if(array[NeighborX][NeighborY]==myself NeighborX>=7  NeighborY>=7  NeighborX<=0  NeighborY<= 0) { //?????? break; } } if(array[NeighborX][NeighborY]==0) { //whether the Extended neighbor is same as myself(m,n) if(getWeight(NeighborX,NeighborY)>=Max ) { Max=getWeight(NeighborX,NeighborY); bestMove = new Move(NeighborX, NeighborY); } } } } } } } } } array[bestMove.x][bestMove.y]=this.myself; return bestMove; } public void change(int x, int y, int rival) { int neighborX=10; int neighborY=10; int extendedNeighborX=10; int extendedNeighborY=10; for(int i=0;i<3;i++) { for(int j=0;j<3;j++) { neighborX= x+(i1); neighborY= y+(j1); if(neighborX>=0 && neighborY>=0 && neighborX<8 && neighborY<8) { if(array[neighborX][neighborY]==myself){ while(array[neighborX][neighborY]==myself) { neighborX= neighborX+(i1); neighborY= neighborY+(j1); if(neighborX>7  neighborY>7  neighborX<0  neighborY< 0) { break; } } extendedNeighborX= neighborX; extendedNeighborY= neighborY; while(extendedNeighborX<8 && extendedNeighborY<8 && extendedNeighborX>=0 && extendedNeighborY>=0 && array[extendedNeighborX][extendedNeighborY]==array[x][y] && !( extendedNeighborX==x && extendedNeighborY==y)) { // if a player enter() legally extendedNeighborX=extendedNeighborX(i1); 
extendedNeighborY=extendedNeighborY(j1); this.array[extendedNeighborX][extendedNeighborY]=this.array[x][y]; } } } } } }
}