I am building an LSTM model and want to optimize its hyperparameters with particle swarm optimization (PSO). Since I am not familiar with parallel computation, I would like the computation to be economical in both time and money. Based on the material I have read, I wrote the following code.
Creating a model:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import TimeDistributed
from keras import optimizers

def create_lstm(layn, uni, drp, lnr, clpn):
    regressor = Sequential()
    # Stacked LSTM layers; only the first layer needs input_shape
    for k in range(layn):
        if k == 0:
            regressor.add(LSTM(units=uni, return_sequences=True,
                               input_shape=(None, 1)))
        else:
            regressor.add(LSTM(units=uni, return_sequences=True))
        regressor.add(Dropout(drp))
    # Adding the output layer
    regressor.add(TimeDistributed(Dense(units=1)))
    # Compiling
    opt1 = optimizers.Adam(learning_rate=lnr, global_clipnorm=clpn)
    regressor.compile(optimizer=opt1, loss="mean_squared_error")
    return regressor
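As a sanity check, the builder can be called directly; the hyperparameter values below are arbitrary examples, not tuned settings:

# arbitrary example values, just to confirm the model builds and compiles
model = create_lstm(layn=3, uni=200, drp=0.3, lnr=1e-3, clpn=0.5)
model.summary()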
Training and computing the fitness:
from keras.callbacks import EarlyStopping
import tensorflow as tf

def train_model(params):
    strategy = tf.distribute.MirroredStrategy()
    # the model must be built and compiled inside the strategy scope
    with strategy.scope():
        model = create_lstm(params['layn'], params['uni'], params['drp'],
                            params['lnr'], params['clpn'])
    callback = EarlyStopping(monitor="val_loss", patience=params['pat'],
                             restore_best_weights=True)
    history = model.fit(params['x'], params['y'], epochs=10000,
                        batch_size=params['bs'],
                        sample_weight=params['sw'],
                        validation_data=(params['x_val'], params['y_val'], params['vsw']),
                        callbacks=[callback])
    # computing fitness
    fitness = ...
    return model, fitness
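For illustration only (my actual fitness computation is omitted above), the fitness could be taken as the best validation loss reached during training:

# illustrative placeholder, not my real fitness function
fitness = min(history.history['val_loss'])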
Part of my particle swarm optimization algorithm:
import random
import math
import multiprocessing as mp
import numpy as np

def pso_lstm(x, y, x_val, y_val, x_test, y_test, sw, vsw):
    nop = 30     # number of particles
    maxit = 100  # maximum iteration number
    nod = 6      # number of dimensions
    c1, c2 = 2.0, 2.0
    # boundaries of the hyperparameters
    bon = np.array([[200, 1200],              # unit number
                    [0.0, 1.0],               # dropout rate
                    [-7.0, math.log10(0.5)],  # log10(learning rate)
                    [-7.0, math.log10(0.5)],  # log10(global clipnorm)
                    [50, 1000],               # patience
                    [3, 10]])                 # log2(batch size)
    pos = np.zeros((nod, nop))
    vel = np.zeros((nod, nop))
    pbest = np.zeros((nod, nop))
    gbest = np.zeros(nod)  # one global-best position vector
    pbest_fit = 1E20 * np.ones(nop)
    gbest_fit = 1E20
    params_list = []
    # initialize particle positions and velocities
    for i in range(nop):  # loop over particles
        pos[0, i] = random.randrange(200, 1201)            # unit number
        pos[1, i] = random.uniform(0.2, 0.5)               # dropout rate
        pos[2, i] = random.uniform(-7.0, math.log10(0.5))  # log10(learning rate)
        pos[3, i] = random.uniform(-7.0, math.log10(0.5))  # log10(global clipnorm)
        pos[4, i] = random.randrange(50, 1001)             # patience
        pos[5, i] = random.randrange(3, 11)                # log2(batch size)
        d1 = {'x': x, 'y': y, 'x_val': x_val, 'y_val': y_val,
              'x_test': x_test, 'y_test': y_test, 'sw': sw, 'vsw': vsw,
              'layn': 3,
              'uni': round(pos[0, i]),
              'drp': pos[1, i],
              'lnr': 10 ** pos[2, i],
              'clpn': 10 ** pos[3, i],
              'pat': round(pos[4, i]),
              'bs': 2 ** round(pos[5, i])}
        params_list.append(d1)
    gbest_fit2 = np.zeros(maxit)
    for j in range(maxit):
        # evaluate all particles in parallel; the pool is closed automatically
        with mp.Pool(processes=mp.cpu_count()) as pool:
            results = pool.map(train_model, params_list)
        # update the optimal personal fitness and position
        ...
        # update the optimal group fitness and position
        ...
        # update particle velocities and positions (see the sketch after this code)
        ...
        # modify params_list
        ...
    return optimal_model
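The elided velocity/position update follows the standard PSO rule; a minimal sketch (the inertia weight w is an assumption, since it does not appear in my snippet above):

# standard PSO update (sketch); w is an assumed inertia weight
w = 0.7
for i in range(nop):
    r1, r2 = np.random.rand(nod), np.random.rand(nod)
    vel[:, i] = (w * vel[:, i]
                 + c1 * r1 * (pbest[:, i] - pos[:, i])
                 + c2 * r2 * (gbest - pos[:, i]))
    # keep each particle inside the hyperparameter boundaries
    pos[:, i] = np.clip(pos[:, i] + vel[:, i], bon[:, 0], bon[:, 1])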
The code related to parallel computation is

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    ...

in train_model, and

with mp.Pool(processes=mp.cpu_count()) as pool:
    results = pool.map(train_model, params_list)

in pso_lstm.
My goal is to train each individual model faster and to train multiple models in parallel. However, I do not fully understand these two pieces of code, or the terminology behind them, so I am not clear on how Google Colab allocates the GPU. When I executed mp.cpu_count(), I got 12. I also tried setting nop = 1 and maxit = 1 (i.e., training a single model) with unit=1200, patience=500, batch_size=1024 (and x.shape=(1763, 167, 1)); the computation took more than 3 hours. Around epoch 6000 the validation loss was lower than after a few hundred epochs, but my available compute units went from about 50 to 0. Since I am not familiar with Colab, is there a more cautious and efficient way to run this? I think I could reduce the patience, or tune it separately, to find hyperparameters more efficiently. Could you please suggest further modifications to the code?
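For reference, this is how I check what hardware Colab has actually allocated to the runtime:

import tensorflow as tf

# lists the GPUs visible to TensorFlow; a standard Colab runtime typically
# exposes a single GPU, so MirroredStrategy has only one device to mirror to
print(tf.config.list_physical_devices('GPU'))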
In addition, does tf.distribute.MirroredStrategy() parallelize the computation over sub-batches? And is it possible to parallelize the computation of the loss gradient across timesteps?
Thank you very much!